diff options
Diffstat (limited to 'lib/gitlab/sidekiq_middleware/server_metrics.rb')
-rw-r--r-- | lib/gitlab/sidekiq_middleware/server_metrics.rb | 59 |
1 files changed, 36 insertions, 23 deletions
diff --git a/lib/gitlab/sidekiq_middleware/server_metrics.rb b/lib/gitlab/sidekiq_middleware/server_metrics.rb index 058c23178f8..a8b3683e09f 100644 --- a/lib/gitlab/sidekiq_middleware/server_metrics.rb +++ b/lib/gitlab/sidekiq_middleware/server_metrics.rb @@ -11,28 +11,25 @@ module Gitlab # most of the durations for cpu, gitaly, db and elasticsearch SIDEKIQ_LATENCY_BUCKETS = [0.1, 0.5, 1, 2.5].freeze - # These are the buckets we currently use for alerting, we will likely - # replace these histograms with Application SLIs - # https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1313 + # These buckets are only available on self-managed. + # We have replaced with Application SLIs on GitLab.com. + # https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/700 SIDEKIQ_JOB_DURATION_BUCKETS = [10, 300].freeze SIDEKIQ_QUEUE_DURATION_BUCKETS = [10, 60].freeze # These labels from Gitlab::SidekiqMiddleware::MetricsHelper are included in SLI metrics - SIDEKIQ_SLI_LABELS = [:worker, :feature_category, :urgency, :external_dependencies].freeze + SIDEKIQ_SLI_LABELS = [:worker, :feature_category, :urgency, :external_dependencies, :queue].freeze class << self include ::Gitlab::SidekiqMiddleware::MetricsHelper def metrics - { + metrics = { sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds this Sidekiq job spent on the CPU', {}, SIDEKIQ_LATENCY_BUCKETS), - sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_JOB_DURATION_BUCKETS), sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), - sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_QUEUE_DURATION_BUCKETS), sidekiq_redis_requests_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_redis_requests_duration_seconds, 'Duration in seconds that a Sidekiq job spent requests a Redis server', {}, Gitlab::Instrumentation::Redis::QUERY_TIME_BUCKETS), sidekiq_elasticsearch_requests_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_elasticsearch_requests_duration_seconds, 'Duration in seconds that a Sidekiq job spent in requests to an Elasticsearch server', {}, SIDEKIQ_LATENCY_BUCKETS), - sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_interrupted_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_interrupted_total, 'Sidekiq jobs interrupted'), sidekiq_redis_requests_total: ::Gitlab::Metrics.counter(:sidekiq_redis_requests_total, 'Redis requests during a Sidekiq job execution'), @@ -41,6 +38,17 @@ module Gitlab sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all), sidekiq_mem_total_bytes: ::Gitlab::Metrics.gauge(:sidekiq_mem_total_bytes, 'Number of bytes allocated for both objects consuming an object slot and objects that required a malloc', {}, :all) } + + if Feature.enabled?(:emit_sidekiq_histogram_metrics, type: :ops) + metrics[:sidekiq_jobs_completion_seconds] = ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_JOB_DURATION_BUCKETS) + metrics[:sidekiq_jobs_queue_duration_seconds] = ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_QUEUE_DURATION_BUCKETS) + metrics[:sidekiq_jobs_failed_total] = ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed') + else + # The sum metric is still used in GitLab.com for dashboards + metrics[:sidekiq_jobs_completion_seconds_sum] = ::Gitlab::Metrics.counter(:sidekiq_jobs_completion_seconds_sum, 'Total of seconds to complete Sidekiq job') + end + + metrics end def initialize_process_metrics @@ -59,13 +67,15 @@ module Gitlab base_labels = create_labels(worker_class, queue, {}) possible_sli_labels << base_labels.slice(*SIDEKIQ_SLI_LABELS) + next unless Feature.enabled?(:emit_sidekiq_histogram_metrics, type: :ops) + %w[done fail].each do |status| metrics[:sidekiq_jobs_completion_seconds].get(base_labels.merge(job_status: status)) end end - Gitlab::Metrics::SidekiqSlis.initialize_execution_slis!(possible_sli_labels) if ::Feature.enabled?(:sidekiq_execution_application_slis) - Gitlab::Metrics::SidekiqSlis.initialize_queueing_slis!(possible_sli_labels) if ::Feature.enabled?(:sidekiq_queueing_application_slis) + Gitlab::Metrics::SidekiqSlis.initialize_execution_slis!(possible_sli_labels) + Gitlab::Metrics::SidekiqSlis.initialize_queueing_slis!(possible_sli_labels) end end @@ -92,7 +102,8 @@ module Gitlab def instrument(job, labels) queue_duration = ::Gitlab::InstrumentationHelper.queue_duration_for_job(job) - @metrics[:sidekiq_jobs_queue_duration_seconds].observe(labels, queue_duration) if queue_duration + @metrics[:sidekiq_jobs_queue_duration_seconds]&.observe(labels, queue_duration) if queue_duration + @metrics[:sidekiq_running_jobs].increment(labels, 1) if job['retry_count'].present? @@ -119,13 +130,21 @@ module Gitlab # sidekiq_running_jobs, sidekiq_jobs_failed_total should not include the job_status label @metrics[:sidekiq_running_jobs].increment(labels, -1) - @metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded + + if Feature.enabled?(:emit_sidekiq_histogram_metrics, type: :ops) + @metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded + else + # we don't need job_status label here + @metrics[:sidekiq_jobs_completion_seconds_sum].increment(labels, monotonic_time) + end # job_status: done, fail match the job_status attribute in structured logging labels[:job_status] = job_succeeded ? "done" : "fail" instrumentation = job[:instrumentation] || {} @metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime) - @metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time) + + @metrics[:sidekiq_jobs_completion_seconds]&.observe(labels, monotonic_time) + @metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000) @metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, get_gitaly_time(instrumentation)) @metrics[:sidekiq_redis_requests_total].increment(labels, get_redis_calls(instrumentation)) @@ -143,16 +162,10 @@ module Gitlab @metrics[:sidekiq_load_balancing_count].increment(labels.merge(load_balancing_labels), 1) end - if ::Feature.enabled?(:sidekiq_execution_application_slis) - sli_labels = labels.slice(*SIDEKIQ_SLI_LABELS) - Gitlab::Metrics::SidekiqSlis.record_execution_apdex(sli_labels, monotonic_time) if job_succeeded - Gitlab::Metrics::SidekiqSlis.record_execution_error(sli_labels, !job_succeeded) - end - - if ::Feature.enabled?(:sidekiq_queueing_application_slis) - sli_labels = labels.slice(*SIDEKIQ_SLI_LABELS) - Gitlab::Metrics::SidekiqSlis.record_queueing_apdex(sli_labels, queue_duration) if queue_duration - end + sli_labels = labels.slice(*SIDEKIQ_SLI_LABELS) + Gitlab::Metrics::SidekiqSlis.record_execution_apdex(sli_labels, monotonic_time) if job_succeeded + Gitlab::Metrics::SidekiqSlis.record_execution_error(sli_labels, !job_succeeded) + Gitlab::Metrics::SidekiqSlis.record_queueing_apdex(sli_labels, queue_duration) if queue_duration end end |