diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2022-10-20 12:40:42 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2022-10-20 12:40:42 +0300 |
commit | ee664acb356f8123f4f6b00b73c1e1cf0866c7fb (patch) | |
tree | f8479f94a28f66654c6a4f6fb99bad6b4e86a40e /lib/gitlab/memory | |
parent | 62f7d5c5b69180e82ae8196b7b429eeffc8e7b4f (diff) |
Add latest changes from gitlab-org/gitlab@15-5-stable-ee (tag: v15.5.0-rc42)
Diffstat (limited to 'lib/gitlab/memory')
-rw-r--r-- | lib/gitlab/memory/diagnostic_reports_logger.rb | 19 | ||||
-rw-r--r-- | lib/gitlab/memory/reports_daemon.rb | 2 | ||||
-rw-r--r-- | lib/gitlab/memory/reports_uploader.rb | 52 | ||||
-rw-r--r-- | lib/gitlab/memory/upload_and_cleanup_reports.rb | 72 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog.rb | 181 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/configuration.rb | 64 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/monitor/heap_fragmentation.rb | 51 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/monitor/unique_memory_growth.rb | 47 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/monitor_state.rb | 85 |
9 files changed, 428 insertions, 145 deletions
diff --git a/lib/gitlab/memory/diagnostic_reports_logger.rb b/lib/gitlab/memory/diagnostic_reports_logger.rb
new file mode 100644
index 00000000000..cc5b719fa19
--- /dev/null
+++ b/lib/gitlab/memory/diagnostic_reports_logger.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require 'logger'
+
+module Gitlab
+  module Memory
+    class DiagnosticReportsLogger < ::Logger
+      def format_message(severity, timestamp, progname, message)
+        data = {}
+        data[:severity] = severity
+        data[:time] = timestamp.utc.iso8601(3)
+
+        data.merge!(message)
+
+        "#{JSON.generate(data)}\n" # rubocop:disable Gitlab/Json
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/reports_daemon.rb b/lib/gitlab/memory/reports_daemon.rb
index ed1da8baab5..0dfc31235e7 100644
--- a/lib/gitlab/memory/reports_daemon.rb
+++ b/lib/gitlab/memory/reports_daemon.rb
@@ -7,7 +7,7 @@ module Gitlab
       DEFAULT_SLEEP_MAX_DELTA_S = 600 # 0..10 minutes
       DEFAULT_SLEEP_BETWEEN_REPORTS_S = 120 # 2 minutes
 
-      DEFAULT_REPORTS_PATH = '/tmp'
+      DEFAULT_REPORTS_PATH = Dir.tmpdir
 
       def initialize(**options)
         super
diff --git a/lib/gitlab/memory/reports_uploader.rb b/lib/gitlab/memory/reports_uploader.rb
new file mode 100644
index 00000000000..76c3e0862e2
--- /dev/null
+++ b/lib/gitlab/memory/reports_uploader.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require_relative '../metrics/system'
+
+module Gitlab
+  module Memory
+    class ReportsUploader
+      def initialize(gcs_key:, gcs_project:, gcs_bucket:, logger:)
+        @gcs_bucket = gcs_bucket
+        @fog = Fog::Storage::Google.new(google_project: gcs_project, google_json_key_location: gcs_key)
+        @logger = logger
+      end
+
+      def upload(path)
+        log_upload_requested(path)
+        start_monotonic_time = Gitlab::Metrics::System.monotonic_time
+
+        File.open(path.to_s) { |file| fog.put_object(gcs_bucket, File.basename(path), file) }
+
+        duration_s = Gitlab::Metrics::System.monotonic_time - start_monotonic_time
+        log_upload_success(path, duration_s)
+      rescue StandardError, Errno::ENOENT => error
+        log_exception(error)
+      end
+
+      private
+
+      attr_reader :gcs_bucket, :fog, :logger
+
+      def log_upload_requested(path)
+        logger.info(log_labels.merge(perf_report_status: 'upload requested', perf_report_path: path))
+      end
+
+      def log_upload_success(path, duration_s)
+        logger.info(log_labels.merge(perf_report_status: 'upload success', perf_report_path: path,
+                                     duration_s: duration_s))
+      end
+
+      def log_exception(error)
+        logger.error(log_labels.merge(perf_report_status: "error", error: error.message))
+      end
+
+      def log_labels
+        {
+          message: "Diagnostic reports",
+          class: self.class.name,
+          pid: $$
+        }
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/upload_and_cleanup_reports.rb b/lib/gitlab/memory/upload_and_cleanup_reports.rb
new file mode 100644
index 00000000000..27d94df478c
--- /dev/null
+++ b/lib/gitlab/memory/upload_and_cleanup_reports.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class UploadAndCleanupReports
+      DEFAULT_SLEEP_TIME_SECONDS = 900 # 15 minutes
+
+      def initialize(
+        uploader:,
+        reports_path:,
+        logger:,
+        sleep_time_seconds: ENV['GITLAB_DIAGNOSTIC_REPORTS_UPLOADER_SLEEP_S']&.to_i || DEFAULT_SLEEP_TIME_SECONDS)
+
+        @uploader = uploader
+        @reports_path = reports_path
+        @sleep_time_seconds = sleep_time_seconds
+        @alive = true
+        @logger = logger
+      end
+
+      attr_reader :uploader, :reports_path, :sleep_time_seconds, :logger
+
+      def call
+        log_started
+
+        loop do
+          sleep(sleep_time_seconds)
+
+          files_to_process.each { |path| upload_and_cleanup!(path) }
+        end
+      end
+
+      private
+
+      def upload_and_cleanup!(path)
+        uploader.upload(path)
+      rescue StandardError, Errno::ENOENT => error
+        log_exception(error)
+      ensure
+        cleanup!(path)
+      end
+
+      def cleanup!(path)
+        File.unlink(path) if File.exist?(path)
+      rescue Errno::ENOENT
+        # Path does not exist: Ignore. We already check `File.exist?`. Rescue to be extra safe.
+      end
+
+      def files_to_process
+        Dir.entries(reports_path)
+          .map { |path| File.join(reports_path, path) }
+          .select { |path| File.file?(path) }
+      end
+
+      def log_started
+        logger.info(log_labels.merge(perf_report_status: "started"))
+      end
+
+      def log_exception(error)
+        logger.error(log_labels.merge(perf_report_status: "error", error: error.message))
+      end
+
+      def log_labels
+        {
+          message: "Diagnostic reports",
+          class: self.class.name,
+          pid: $$
+        }
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb
index 38231fa933b..7007fdfe386 100644
--- a/lib/gitlab/memory/watchdog.rb
+++ b/lib/gitlab/memory/watchdog.rb
@@ -2,25 +2,10 @@
 
 module Gitlab
   module Memory
-    # A background thread that observes Ruby heap fragmentation and calls
-    # into a handler when the Ruby heap has been fragmented for an extended
-    # period of time.
-    #
-    # See Gitlab::Metrics::Memory for how heap fragmentation is defined.
-    #
-    # To decide whether a given fragmentation level is being exceeded,
-    # the watchdog regularly polls the GC. Whenever a violation occurs
-    # a strike is issued. If the maximum number of strikes are reached,
-    # a handler is invoked to deal with the situation.
-    #
-    # The duration for which a process may be above a given fragmentation
-    # threshold is computed as `max_strikes * sleep_time_seconds`.
+    # A background thread that monitors Ruby memory and calls
+    # into a handler when the Ruby process violates defined limits
+    # for an extended period of time.
     class Watchdog
-      DEFAULT_SLEEP_TIME_SECONDS = 60 * 5
-      DEFAULT_MAX_HEAP_FRAG = 0.5
-      DEFAULT_MAX_MEM_GROWTH = 3.0
-      DEFAULT_MAX_STRIKES = 5
-
       # This handler does nothing. It returns `false` to indicate to the
       # caller that the situation has not been dealt with so it will
       # receive calls repeatedly if fragmentation remains high.
@@ -62,73 +47,27 @@ module Gitlab
         end
       end
 
-      # max_heap_fragmentation:
-      #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
-      # max_mem_growth:
-      #   A multiplier for how much excess private memory a worker can map compared to a reference process
-      #   (itself or the primary in a pre-fork server.)
-      # max_strikes:
-      #   How many times the process is allowed to be above max_heap_fragmentation before
-      #   a handler is invoked.
-      # sleep_time_seconds:
-      #   Used to control the frequency with which the watchdog will wake up and poll the GC.
-      def initialize(
-        handler: NullHandler.instance,
-        logger: Logger.new($stdout),
-        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_MAX_HEAP_FRAG,
-        max_mem_growth: ENV['GITLAB_MEMWD_MAX_MEM_GROWTH']&.to_f || DEFAULT_MAX_MEM_GROWTH,
-        max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
-        sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
-        **options)
-        super(**options)
-
-        @handler = handler
-        @logger = logger
-        @sleep_time_seconds = sleep_time_seconds
-        @max_strikes = max_strikes
-        @stats = {
-          heap_frag: {
-            max: max_heap_fragmentation,
-            strikes: 0
-          },
-          mem_growth: {
-            max: max_mem_growth,
-            strikes: 0
-          }
-        }
-
+      def initialize
+        @configuration = Configuration.new
         @alive = true
 
-        init_prometheus_metrics(max_heap_fragmentation)
-      end
-
-      attr_reader :max_strikes, :sleep_time_seconds
-
-      def max_heap_fragmentation
-        @stats[:heap_frag][:max]
-      end
-
-      def max_mem_growth
-        @stats[:mem_growth][:max]
+        init_prometheus_metrics
       end
 
-      def strikes(stat)
-        @stats[stat][:strikes]
+      def configure
+        yield @configuration
       end
 
       def call
-        @logger.info(log_labels.merge(message: 'started'))
+        logger.info(log_labels.merge(message: 'started'))
 
         while @alive
-          sleep(@sleep_time_seconds)
-
-          next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+          sleep(sleep_time_seconds)
 
-          monitor_heap_fragmentation
-          monitor_memory_growth
+          monitor if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
         end
 
-        @logger.info(log_labels.merge(message: 'stopped'))
+        logger.info(log_labels.merge(message: 'stopped'))
       end
 
       def stop
@@ -137,71 +76,24 @@ module Gitlab
 
       private
 
-      def monitor_memory_condition(stat_key)
-        return unless @alive
-
-        stat = @stats[stat_key]
-
-        ok, labels = yield(stat)
+      def monitor
+        @configuration.monitors.call_each do |result|
+          break unless @alive
 
-        if ok
-          stat[:strikes] = 0
-        else
-          stat[:strikes] += 1
-          @counter_violations.increment(reason: stat_key.to_s)
-        end
+          next unless result.threshold_violated?
 
-        if stat[:strikes] > @max_strikes
-          @alive = !memory_limit_exceeded_callback(stat_key, labels)
-          stat[:strikes] = 0
-        end
-      end
+          @counter_violations.increment(reason: result.monitor_name)
 
-      def monitor_heap_fragmentation
-        monitor_memory_condition(:heap_frag) do |stat|
-          heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
-          [
-            heap_fragmentation <= stat[:max],
-            {
-              message: 'heap fragmentation limit exceeded',
-              memwd_cur_heap_frag: heap_fragmentation,
-              memwd_max_heap_frag: stat[:max]
-            }
-          ]
-        end
-      end
+          next unless result.strikes_exceeded?
 
-      def monitor_memory_growth
-        monitor_memory_condition(:mem_growth) do |stat|
-          worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
-          reference_uss = reference_mem[:uss]
-          memory_limit = stat[:max] * reference_uss
-          [
-            worker_uss <= memory_limit,
-            {
-              message: 'memory limit exceeded',
-              memwd_uss_bytes: worker_uss,
-              memwd_ref_uss_bytes: reference_uss,
-              memwd_max_uss_bytes: memory_limit
-            }
-          ]
+          @alive = !memory_limit_exceeded_callback(result.monitor_name, result.payload)
         end
       end
 
-      # On pre-fork systems this would be the primary process memory from which workers fork.
-      # Otherwise it is the current process' memory.
-      #
-      # We initialize this lazily because in the initializer the application may not have
-      # finished booting yet, which would yield an incorrect baseline.
-      def reference_mem
-        @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
-      end
-
-      def memory_limit_exceeded_callback(stat_key, handler_labels)
-        all_labels = log_labels.merge(handler_labels)
-          .merge(memwd_cur_strikes: strikes(stat_key))
-        @logger.warn(all_labels)
-        @counter_violations_handled.increment(reason: stat_key.to_s)
+      def memory_limit_exceeded_callback(monitor_name, monitor_payload)
+        all_labels = log_labels.merge(monitor_payload)
+        logger.warn(all_labels)
+        @counter_violations_handled.increment(reason: monitor_name)
         handler.call
       end
 
@@ -211,7 +103,15 @@ module Gitlab
         # all that happens is we collect logs and Prometheus events for fragmentation violations.
         return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)
 
-        @handler
+        @configuration.handler
+      end
+
+      def logger
+        @configuration.logger
+      end
+
+      def sleep_time_seconds
+        @configuration.sleep_time_seconds
       end
 
       def log_labels
@@ -219,27 +119,20 @@ module Gitlab
           pid: $$,
           worker_id: worker_id,
           memwd_handler_class: handler.class.name,
-          memwd_sleep_time_s: @sleep_time_seconds,
-          memwd_max_strikes: @max_strikes,
+          memwd_sleep_time_s: sleep_time_seconds,
           memwd_rss_bytes: process_rss_bytes
         }
       end
 
-      def worker_id
-        ::Prometheus::PidProvider.worker_id
-      end
-
       def process_rss_bytes
         Gitlab::Metrics::System.memory_usage_rss
       end
 
-      def init_prometheus_metrics(max_heap_fragmentation)
-        @heap_frag_limit = Gitlab::Metrics.gauge(
-          :gitlab_memwd_heap_frag_limit,
-          'The configured limit for how fragmented the Ruby heap is allowed to be'
-        )
-        @heap_frag_limit.set({}, max_heap_fragmentation)
+      def worker_id
+        ::Prometheus::PidProvider.worker_id
+      end
 
+      def init_prometheus_metrics
         default_labels = { pid: worker_id }
         @counter_violations = Gitlab::Metrics.counter(
           :gitlab_memwd_violations_total,
diff --git a/lib/gitlab/memory/watchdog/configuration.rb b/lib/gitlab/memory/watchdog/configuration.rb
new file mode 100644
index 00000000000..2d84b083f55
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/configuration.rb
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class Watchdog
+      class Configuration
+        class MonitorStack
+          def initialize
+            @monitors = []
+          end
+
+          def use(monitor_class, *args, **kwargs, &block)
+            remove(monitor_class)
+            @monitors.push(build_monitor_state(monitor_class, *args, **kwargs, &block))
+          end
+
+          def call_each
+            @monitors.each do |monitor|
+              yield monitor.call
+            end
+          end
+
+          private
+
+          def remove(monitor_class)
+            @monitors.delete_if { |monitor| monitor.monitor_class == monitor_class }
+          end
+
+          def build_monitor_state(monitor_class, *args, max_strikes:, **kwargs, &block)
+            monitor = build_monitor(monitor_class, *args, **kwargs, &block)
+
+            Gitlab::Memory::Watchdog::MonitorState.new(monitor, max_strikes: max_strikes)
+          end
+
+          def build_monitor(monitor_class, *args, **kwargs, &block)
+            monitor_class.new(*args, **kwargs, &block)
+          end
+        end
+
+        DEFAULT_SLEEP_TIME_SECONDS = 60
+
+        attr_reader :monitors
+        attr_writer :logger, :handler, :sleep_time_seconds
+
+        def initialize
+          @monitors = MonitorStack.new
+        end
+
+        def handler
+          @handler ||= NullHandler.instance
+        end
+
+        def logger
+          @logger ||= Gitlab::Logger.new($stdout)
+        end
+
+        # Used to control the frequency with which the watchdog will wake up and poll the GC.
+        def sleep_time_seconds
+          @sleep_time_seconds ||= DEFAULT_SLEEP_TIME_SECONDS
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/watchdog/monitor/heap_fragmentation.rb b/lib/gitlab/memory/watchdog/monitor/heap_fragmentation.rb
new file mode 100644
index 00000000000..7748c19c6d8
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/monitor/heap_fragmentation.rb
@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class Watchdog
+      module Monitor
+        # A monitor that observes Ruby heap fragmentation and calls
+        # memory_violation_callback when the Ruby heap has been fragmented for an extended
+        # period of time.
+        #
+        # See Gitlab::Metrics::Memory for how heap fragmentation is defined.
+        class HeapFragmentation
+          attr_reader :max_heap_fragmentation
+
+          # max_heap_fragmentation:
+          #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
+          def initialize(max_heap_fragmentation:)
+            @max_heap_fragmentation = max_heap_fragmentation
+            init_frag_limit_metrics
+          end
+
+          def call
+            heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
+
+            return { threshold_violated: false, payload: {} } unless heap_fragmentation > max_heap_fragmentation
+
+            { threshold_violated: true, payload: payload(heap_fragmentation) }
+          end
+
+          private
+
+          def payload(heap_fragmentation)
+            {
+              message: 'heap fragmentation limit exceeded',
+              memwd_cur_heap_frag: heap_fragmentation,
+              memwd_max_heap_frag: max_heap_fragmentation
+            }
+          end
+
+          def init_frag_limit_metrics
+            heap_frag_limit = Gitlab::Metrics.gauge(
+              :gitlab_memwd_heap_frag_limit,
+              'The configured limit for how fragmented the Ruby heap is allowed to be'
+            )
+            heap_frag_limit.set({}, max_heap_fragmentation)
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/watchdog/monitor/unique_memory_growth.rb b/lib/gitlab/memory/watchdog/monitor/unique_memory_growth.rb
new file mode 100644
index 00000000000..2a1512c4cff
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/monitor/unique_memory_growth.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class Watchdog
+      module Monitor
+        class UniqueMemoryGrowth
+          attr_reader :max_mem_growth
+
+          def initialize(max_mem_growth:)
+            @max_mem_growth = max_mem_growth
+          end
+
+          def call
+            worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
+            reference_uss = reference_mem[:uss]
+            memory_limit = max_mem_growth * reference_uss
+
+            return { threshold_violated: false, payload: {} } unless worker_uss > memory_limit
+
+            { threshold_violated: true, payload: payload(worker_uss, reference_uss, memory_limit) }
+          end
+
+          private
+
+          def payload(worker_uss, reference_uss, memory_limit)
+            {
+              message: 'memory limit exceeded',
+              memwd_uss_bytes: worker_uss,
+              memwd_ref_uss_bytes: reference_uss,
+              memwd_max_uss_bytes: memory_limit
+            }
+          end
+
+          # On pre-fork systems this would be the primary process memory from which workers fork.
+          # Otherwise it is the current process' memory.
+          #
+          # We initialize this lazily because in the initializer the application may not have
+          # finished booting yet, which would yield an incorrect baseline.
+          def reference_mem
+            @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/watchdog/monitor_state.rb b/lib/gitlab/memory/watchdog/monitor_state.rb
new file mode 100644
index 00000000000..73be5de3e45
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/monitor_state.rb
@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class Watchdog
+      class MonitorState
+        class Result
+          attr_reader :payload
+
+          def initialize(strikes_exceeded:, threshold_violated:, monitor_class:, payload: )
+            @strikes_exceeded = strikes_exceeded
+            @threshold_violated = threshold_violated
+            @monitor_class = monitor_class
+            @payload = payload
+          end
+
+          def strikes_exceeded?
+            @strikes_exceeded
+          end
+
+          def threshold_violated?
+            @threshold_violated
+          end
+
+          def monitor_name
+            @monitor_class.name.demodulize.underscore.to_sym
+          end
+        end
+
+        def initialize(monitor, max_strikes:)
+          @monitor = monitor
+          @max_strikes = max_strikes
+          @strikes = 0
+        end
+
+        def call
+          reset_strikes if strikes_exceeded?
+
+          monitor_result = @monitor.call
+
+          if monitor_result[:threshold_violated]
+            issue_strike
+          else
+            reset_strikes
+          end
+
+          build_result(monitor_result)
+        end
+
+        def monitor_class
+          @monitor.class
+        end
+
+        private
+
+        def build_result(monitor_result)
+          Result.new(
+            strikes_exceeded: strikes_exceeded?,
+            monitor_class: monitor_class,
+            threshold_violated: monitor_result[:threshold_violated],
+            payload: payload.merge(monitor_result[:payload]))
+        end
+
+        def payload
+          {
+            memwd_max_strikes: @max_strikes,
+            memwd_cur_strikes: @strikes
+          }
+        end
+
+        def strikes_exceeded?
+          @strikes > @max_strikes
+        end
+
+        def issue_strike
+          @strikes += 1
+        end
+
+        def reset_strikes
+          @strikes = 0
+        end
+      end
+    end
+  end
+end