diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2022-07-20 18:08:38 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2022-07-20 18:08:38 +0300 |
commit | 93b6ee78bf98cbc42712b7c5486ab0e78adb339f (patch) | |
tree | 29efc441f06368d2ec05196fcba070ee37c765b7 /lib/gitlab/memory | |
parent | 96add3eb957ee4256910087070e27850dd61cfe9 (diff) |
Add latest changes from gitlab-org/gitlab@master
Diffstat (limited to 'lib/gitlab/memory')
-rw-r--r-- | lib/gitlab/memory/jemalloc.rb | 16 | ||||
-rw-r--r-- | lib/gitlab/memory/reports/jemalloc_stats.rb | 68 | ||||
-rw-r--r-- | lib/gitlab/memory/reports_daemon.rb | 96 |
3 files changed, 175 insertions, 5 deletions
```diff
diff --git a/lib/gitlab/memory/jemalloc.rb b/lib/gitlab/memory/jemalloc.rb
index 454c54569de..fbe5ae656b9 100644
--- a/lib/gitlab/memory/jemalloc.rb
+++ b/lib/gitlab/memory/jemalloc.rb
@@ -14,6 +14,8 @@ module Gitlab
 
       STATS_DEFAULT_FORMAT = :json
 
+      FILENAME_PREFIX = 'jemalloc_stats'
+
       # Return jemalloc stats as a string.
       def stats(format: STATS_DEFAULT_FORMAT)
         verify_format!(format)
@@ -23,13 +25,17 @@ module Gitlab
         end
       end
 
-      # Write jemalloc stats to the given directory.
-      def dump_stats(path:, format: STATS_DEFAULT_FORMAT)
+      # Write jemalloc stats to the given directory
+      # @param [String] path Directory path the dump will be put into
+      # @param [String] format `json` or `txt`
+      # @param [String] filename_label Optional custom string that will be injected into the file name, e.g. `worker_0`
+      # @return [void]
+      def dump_stats(path:, format: STATS_DEFAULT_FORMAT, filename_label: nil)
         verify_format!(format)
 
         with_malloc_stats_print do |stats_print|
           format_settings = STATS_FORMATS[format]
-          File.open(File.join(path, file_name(format_settings[:extension])), 'wb') do |io|
+          File.open(File.join(path, file_name(format_settings[:extension], filename_label)), 'wb') do |io|
             write_stats(stats_print, io, format_settings)
           end
         end
@@ -80,8 +86,8 @@ module Gitlab
         stats_print.call(callback, nil, format[:options])
       end
 
-      def file_name(extension)
-        "jemalloc_stats.#{$$}.#{Time.current.to_i}.#{extension}"
+      def file_name(extension, filename_label)
+        [FILENAME_PREFIX, $$, filename_label, Time.current.to_i, extension].reject(&:blank?).join('.')
       end
     end
   end
diff --git a/lib/gitlab/memory/reports/jemalloc_stats.rb b/lib/gitlab/memory/reports/jemalloc_stats.rb
new file mode 100644
index 00000000000..b3848d40770
--- /dev/null
+++ b/lib/gitlab/memory/reports/jemalloc_stats.rb
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    module Reports
+      class JemallocStats
+        # On prod, Jemalloc reports sizes were ~2.5 MB:
+        # https://gitlab.com/gitlab-com/gl-infra/reliability/-/issues/15993#note_1014767214
+        # We configured 1GB emptyDir per pod:
+        # https://gitlab.com/gitlab-com/gl-infra/k8s-workloads/gitlab-com/-/merge_requests/1949
+        # The pod will be evicted when the size limit is exceeded. We never want this to happen, for availability.
+        #
+        # With the default, we have a headroom (250*2.5MB=625<1000 MB) to fit into configured emptyDir.
+        # It would allow us to keep 3+ days worth of reports for 6 workers running every 2 hours: 3*6*12=216<250
+        #
+        # The cleanup logic will be redundant after we'll implement the uploads, which would perform the cleanup.
+        DEFAULT_MAX_REPORTS_STORED = 250
+
+        def initialize(reports_path:)
+          @reports_path = reports_path
+        end
+
+        def run
+          return unless active?
+
+          Gitlab::Memory::Jemalloc.dump_stats(path: reports_path, filename_label: worker_id)
+          cleanup
+        end
+
+        def active?
+          Feature.enabled?(:report_jemalloc_stats, type: :ops)
+        end
+
+        private
+
+        attr_reader :reports_path
+
+        def cleanup
+          reports_files_modified_order[0...-max_reports_stored].each do |f|
+            File.unlink(f) if File.exist?(f)
+          rescue Errno::ENOENT
+            # Path does not exist: Ignore. We already check `File.exist?`
+            # Rescue to be extra safe, because each worker could perform a cleanup
+          end
+        end
+
+        def reports_files_modified_order
+          pattern = File.join(reports_path, "#{Gitlab::Memory::Jemalloc::FILENAME_PREFIX}*")
+
+          Dir.glob(pattern).sort_by do |f|
+            test('M', f)
+          rescue Errno::ENOENT
+            # Path does not exist: Return any timestamp to proceed with the sort
+            Time.current
+          end
+        end
+
+        def worker_id
+          ::Prometheus::PidProvider.worker_id
+        end
+
+        def max_reports_stored
+          ENV["GITLAB_DIAGNOSTIC_REPORTS_JEMALLOC_MAX_REPORTS_STORED"] || DEFAULT_MAX_REPORTS_STORED
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/memory/reports_daemon.rb b/lib/gitlab/memory/reports_daemon.rb
new file mode 100644
index 00000000000..2b2e0915e72
--- /dev/null
+++ b/lib/gitlab/memory/reports_daemon.rb
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    class ReportsDaemon < Daemon
+      DEFAULT_SLEEP_S = 7200 # 2 hours
+      DEFAULT_SLEEP_MAX_DELTA_S = 600 # 0..10 minutes
+      DEFAULT_SLEEP_BETWEEN_REPORTS_S = 120 # 2 minutes
+
+      DEFAULT_REPORTS_PATH = '/tmp'
+
+      def initialize(**options)
+        super
+
+        @alive = true
+
+        @sleep_s =
+          ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_S']&.to_i || DEFAULT_SLEEP_S
+        @sleep_max_delta_s =
+          ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_MAX_DELTA_S']&.to_i || DEFAULT_SLEEP_MAX_DELTA_S
+        @sleep_between_reports_s =
+          ENV['GITLAB_DIAGNOSTIC_REPORTS_SLEEP_BETWEEN_REPORTS_S']&.to_i || DEFAULT_SLEEP_BETWEEN_REPORTS_S
+
+        @reports_path =
+          ENV["GITLAB_DIAGNOSTIC_REPORTS_PATH"] || DEFAULT_REPORTS_PATH
+
+        @reports = [Gitlab::Memory::Reports::JemallocStats.new(reports_path: reports_path)]
+
+        init_prometheus_metrics
+      end
+
+      attr_reader :sleep_s, :sleep_max_delta_s, :sleep_between_reports_s, :reports_path
+
+      def run_thread
+        while alive
+          sleep interval_with_jitter
+
+          reports.select(&:active?).each do |report|
+            tms = Benchmark.measure do
+              report.run
+            end
+
+            log_report(report_label(report), tms)
+            @report_duration_counter.increment({ report: report_label(report) }, tms.real.to_i)
+
+            sleep sleep_between_reports_s
+          end
+        end
+      end
+
+      private
+
+      attr_reader :alive, :reports
+
+      # Returns the sleep interval with a random adjustment.
+      # The random adjustment is put in place to ensure continued availability.
+      def interval_with_jitter
+        sleep_s + rand(sleep_max_delta_s)
+      end
+
+      def log_report(report_label, tms)
+        Gitlab::AppLogger.info(
+          message: 'finished',
+          pid: $$,
+          worker_id: worker_id,
+          report: report_label,
+          duration_s: tms.real.round(2),
+          cpu_s: tms.utime.round(2),
+          sys_cpu_s: tms.stime.round(2)
+        )
+      end
+
+      def worker_id
+        ::Prometheus::PidProvider.worker_id
+      end
+
+      def report_label(report)
+        report.class.to_s.demodulize.underscore
+      end
+
+      def stop_working
+        @alive = false
+      end
+
+      def init_prometheus_metrics
+        default_labels = { pid: worker_id }
+
+        @report_duration_counter = Gitlab::Metrics.counter(
+          :gitlab_diag_report_duration_seconds_total,
+          'Total time elapsed for running diagnostic report',
+          default_labels
+        )
+      end
+    end
+  end
+end
```