diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2022-08-18 11:17:02 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2022-08-18 11:17:02 +0300 |
commit | b39512ed755239198a9c294b6a45e65c05900235 (patch) | |
tree | d234a3efade1de67c46b9e5a38ce813627726aa7 /lib/gitlab/memory | |
parent | d31474cf3b17ece37939d20082b07f6657cc79a9 (diff) | |
Add latest changes from gitlab-org/gitlab@15-3-stable-ee (tag: v15.3.0-rc42)
Diffstat (limited to 'lib/gitlab/memory')
-rw-r--r-- | lib/gitlab/memory/jemalloc.rb | 22 | ||||
-rw-r--r-- | lib/gitlab/memory/reports/jemalloc_stats.rb | 67 | ||||
-rw-r--r-- | lib/gitlab/memory/reports_daemon.rb | 106 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog.rb | 18 |
4 files changed, 197 insertions, 16 deletions
diff --git a/lib/gitlab/memory/jemalloc.rb b/lib/gitlab/memory/jemalloc.rb index 454c54569de..7163a70a5cb 100644 --- a/lib/gitlab/memory/jemalloc.rb +++ b/lib/gitlab/memory/jemalloc.rb @@ -14,6 +14,8 @@ module Gitlab STATS_DEFAULT_FORMAT = :json + FILENAME_PREFIX = 'jemalloc_stats' + # Return jemalloc stats as a string. def stats(format: STATS_DEFAULT_FORMAT) verify_format!(format) @@ -23,16 +25,24 @@ module Gitlab end end - # Write jemalloc stats to the given directory. - def dump_stats(path:, format: STATS_DEFAULT_FORMAT) + # Write jemalloc stats to the given directory + # @param [String] path Directory path the dump will be put into + # @param [String] format `json` or `txt` + # @param [String] filename_label Optional custom string that will be injected into the file name, e.g. `worker_0` + # @return [String] Full path to the resulting dump file + def dump_stats(path:, format: STATS_DEFAULT_FORMAT, filename_label: nil) verify_format!(format) + format_settings = STATS_FORMATS[format] + file_path = File.join(path, file_name(format_settings[:extension], filename_label)) + with_malloc_stats_print do |stats_print| - format_settings = STATS_FORMATS[format] - File.open(File.join(path, file_name(format_settings[:extension])), 'wb') do |io| + File.open(file_path, 'wb') do |io| write_stats(stats_print, io, format_settings) end end + + file_path end private @@ -80,8 +90,8 @@ module Gitlab stats_print.call(callback, nil, format[:options]) end - def file_name(extension) - "jemalloc_stats.#{$$}.#{Time.current.to_i}.#{extension}" + def file_name(extension, filename_label) + [FILENAME_PREFIX, $$, filename_label, Time.current.to_i, extension].reject(&:blank?).join('.') end end end diff --git a/lib/gitlab/memory/reports/jemalloc_stats.rb b/lib/gitlab/memory/reports/jemalloc_stats.rb new file mode 100644 index 00000000000..b99bec4ac3e --- /dev/null +++ b/lib/gitlab/memory/reports/jemalloc_stats.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +module Gitlab + module Memory + 
module Reports + class JemallocStats + # On prod, Jemalloc reports sizes were ~2.5 MB: + # https://gitlab.com/gitlab-com/gl-infra/reliability/-/issues/15993#note_1014767214 + # We configured 1GB emptyDir per pod: + # https://gitlab.com/gitlab-com/gl-infra/k8s-workloads/gitlab-com/-/merge_requests/1949 + # The pod will be evicted when the size limit is exceeded. We never want this to happen, for availability. + # + # With the default, we have a headroom (250*2.5MB=625<1000 MB) to fit into configured emptyDir. + # It would allow us to keep 3+ days worth of reports for 6 workers running every 2 hours: 3*6*12=216<250 + # + # The cleanup logic will be redundant after we'll implement the uploads, which would perform the cleanup. + DEFAULT_MAX_REPORTS_STORED = 250 + + def initialize(reports_path:) + @reports_path = reports_path + end + + def run + return unless active? + + Gitlab::Memory::Jemalloc.dump_stats(path: reports_path, filename_label: worker_id).tap { cleanup } + end + + def active? + Feature.enabled?(:report_jemalloc_stats, type: :ops) + end + + private + + attr_reader :reports_path + + def cleanup + reports_files_modified_order[0...-max_reports_stored].each do |f| + File.unlink(f) if File.exist?(f) + rescue Errno::ENOENT + # Path does not exist: Ignore. 
We already check `File.exist?` + # Rescue to be extra safe, because each worker could perform a cleanup + end + end + + def reports_files_modified_order + pattern = File.join(reports_path, "#{Gitlab::Memory::Jemalloc::FILENAME_PREFIX}*") + + Dir.glob(pattern).sort_by do |f| + test('M', f) + rescue Errno::ENOENT + # Path does not exist: Return any timestamp to proceed with the sort + Time.current + end + end + + def worker_id + ::Prometheus::PidProvider.worker_id + end + + def max_reports_stored + ENV["GITLAB_DIAGNOSTIC_REPORTS_JEMALLOC_MAX_REPORTS_STORED"] || DEFAULT_MAX_REPORTS_STORED + end + end + end + end +end diff --git a/lib/gitlab/memory/reports_daemon.rb b/lib/gitlab/memory/reports_daemon.rb new file mode 100644 index 00000000000..ed1da8baab5 --- /dev/null +++ b/lib/gitlab/memory/reports_daemon.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true + +module Gitlab + module Memory + class ReportsDaemon < Daemon + DEFAULT_SLEEP_S = 7200 # 2 hours + DEFAULT_SLEEP_MAX_DELTA_S = 600 # 0..10 minutes + DEFAULT_SLEEP_BETWEEN_REPORTS_S = 120 # 2 minutes + + DEFAULT_REPORTS_PATH = '/tmp' + + def initialize(**options) + super + + @alive = true + + @sleep_s = + ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_S']&.to_i || DEFAULT_SLEEP_S + @sleep_max_delta_s = + ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_MAX_DELTA_S']&.to_i || DEFAULT_SLEEP_MAX_DELTA_S + @sleep_between_reports_s = + ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_BETWEEN_REPORTS_S']&.to_i || DEFAULT_SLEEP_BETWEEN_REPORTS_S + + @reports_path = + ENV["GITLAB_DIAGNOSTIC_REPORTS_PATH"] || DEFAULT_REPORTS_PATH + + @reports = [Gitlab::Memory::Reports::JemallocStats.new(reports_path: reports_path)] + + init_prometheus_metrics + end + + attr_reader :sleep_s, :sleep_max_delta_s, :sleep_between_reports_s, :reports_path + + def run_thread + while alive + sleep interval_with_jitter + + reports.select(&:active?).each do |report| + start_monotonic_time = Gitlab::Metrics::System.monotonic_time + start_thread_cpu_time = 
Gitlab::Metrics::System.thread_cpu_time + + file_path = report.run + + cpu_s = Gitlab::Metrics::System.thread_cpu_duration(start_thread_cpu_time) + duration_s = Gitlab::Metrics::System.monotonic_time - start_monotonic_time + + log_report(label: report_label(report), cpu_s: cpu_s, duration_s: duration_s, size: file_size(file_path)) + @report_duration_counter.increment({ report: report_label(report) }, duration_s) + + sleep sleep_between_reports_s + end + end + end + + private + + attr_reader :alive, :reports + + # Returns the sleep interval with a random adjustment. + # The random adjustment is put in place to ensure continued availability. + def interval_with_jitter + sleep_s + rand(sleep_max_delta_s) + end + + def log_report(label:, duration_s:, cpu_s:, size:) + Gitlab::AppLogger.info( + message: 'finished', + pid: $$, + worker_id: worker_id, + perf_report: label, + duration_s: duration_s.round(2), + cpu_s: cpu_s.round(2), + perf_report_size_bytes: size + ) + end + + def worker_id + ::Prometheus::PidProvider.worker_id + end + + def report_label(report) + report.class.to_s.demodulize.underscore + end + + def stop_working + @alive = false + end + + def init_prometheus_metrics + default_labels = { pid: worker_id } + + @report_duration_counter = Gitlab::Metrics.counter( + :gitlab_diag_report_duration_seconds_total, + 'Total time elapsed for running diagnostic report', + default_labels + ) + end + + def file_size(file_path) + File.size(file_path.to_s) + rescue Errno::ENOENT + 0 + end + end + end +end diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb index db75ba8a47d..91edb68ad66 100644 --- a/lib/gitlab/memory/watchdog.rb +++ b/lib/gitlab/memory/watchdog.rb @@ -15,7 +15,7 @@ module Gitlab # # The duration for which a process may be above a given fragmentation # threshold is computed as `max_strikes * sleep_time_seconds`. 
- class Watchdog < Daemon + class Watchdog DEFAULT_SLEEP_TIME_SECONDS = 60 DEFAULT_HEAP_FRAG_THRESHOLD = 0.5 DEFAULT_MAX_STRIKES = 5 @@ -91,7 +91,7 @@ module Gitlab attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds - def run_thread + def call @logger.info(log_labels.merge(message: 'started')) while @alive @@ -103,6 +103,10 @@ module Gitlab @logger.info(log_labels.merge(message: 'stopped')) end + def stop + @alive = false + end + private def monitor_heap_fragmentation @@ -141,10 +145,6 @@ module Gitlab @handler end - def stop_working - @alive = false - end - def log_labels { pid: $$, @@ -167,15 +167,13 @@ module Gitlab end def init_prometheus_metrics(max_heap_fragmentation) - default_labels = { pid: worker_id } - @heap_frag_limit = Gitlab::Metrics.gauge( :gitlab_memwd_heap_frag_limit, - 'The configured limit for how fragmented the Ruby heap is allowed to be', - default_labels + 'The configured limit for how fragmented the Ruby heap is allowed to be' ) @heap_frag_limit.set({}, max_heap_fragmentation) + default_labels = { pid: worker_id } @heap_frag_violations = Gitlab::Metrics.counter( :gitlab_memwd_heap_frag_violations_total, 'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum', |