Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGitLab Bot <gitlab-bot@gitlab.com>2019-09-18 21:06:14 +0300
committerGitLab Bot <gitlab-bot@gitlab.com>2019-09-18 21:06:14 +0300
commitb08279013423a66f06f5edde4e067f328fe135bd (patch)
tree47aca1a9b0655cd7861bddf31ec17d4b302fc4ea /lib/gitlab/sidekiq_daemon
parent4584eb0e07d372d6014de16ab359965475184c99 (diff)
Add latest changes from gitlab-org/gitlab@master
Diffstat (limited to 'lib/gitlab/sidekiq_daemon')
-rw-r--r--lib/gitlab/sidekiq_daemon/memory_killer.rb218
-rw-r--r--lib/gitlab/sidekiq_daemon/monitor.rb50
2 files changed, 247 insertions, 21 deletions
diff --git a/lib/gitlab/sidekiq_daemon/memory_killer.rb b/lib/gitlab/sidekiq_daemon/memory_killer.rb
new file mode 100644
index 00000000000..2089d82e254
--- /dev/null
+++ b/lib/gitlab/sidekiq_daemon/memory_killer.rb
@@ -0,0 +1,218 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module SidekiqDaemon
+ class MemoryKiller < Daemon
+ include ::Gitlab::Utils::StrongMemoize
+
+ # Today 64-bit CPU support max 256T memory. It is big enough.
+ MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024
+ # RSS below `soft_limit_rss` is considered safe
+ SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i
+ # RSS above `hard_limit_rss` will be stopped
+ HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i
+ # RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS
+ GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i
+ # Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds
+ CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max
+ # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit
+ SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
+
+ def initialize
+ super
+
+ @enabled = true
+ end
+
+ private
+
+ def start_working
+ Sidekiq.logger.info(
+ class: self.class.to_s,
+ action: 'start',
+ pid: pid,
+ message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon'
+ )
+
+ while enabled?
+ begin
+ restart_sidekiq unless rss_within_range?
+ sleep(CHECK_INTERVAL_SECONDS)
+ rescue => e
+ log_exception(e, __method__)
+ rescue Exception => e # rubocop:disable Lint/RescueException
+ log_exception(e, __method__ )
+ raise e
+ end
+ end
+ ensure
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ action: 'stop',
+ pid: pid,
+ message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon'
+ )
+ end
+
+ def log_exception(exception, method)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ message: "Exception from #{method}: #{exception.message}"
+ )
+ end
+
+ def stop_working
+ @enabled = false
+ end
+
+ def enabled?
+ @enabled
+ end
+
+ def restart_sidekiq
+ # Tell Sidekiq to stop fetching new jobs
+ # We first SIGNAL and then wait given time
+ # We also monitor a number of running jobs and allow to restart early
+ signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
+ return unless enabled?
+
+ # Tell sidekiq to restart itself
+ # Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
+ signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
+ return unless enabled?
+
+ # Ideally we should never reach this condition
+ # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
+ # Kill the whole pgroup, so we can be sure no children are left behind
+ signal_pgroup('SIGKILL', 'die')
+ end
+
+ def rss_within_range?
+ current_rss = nil
+ deadline = Time.now + GRACE_BALLOON_SECONDS.seconds
+ loop do
+ return true unless enabled?
+
+ current_rss = get_rss
+
+ # RSS go above hard limit should trigger forcible shutdown right away
+ break if current_rss > hard_limit_rss
+
+ # RSS go below the soft limit
+ return true if current_rss < soft_limit_rss
+
+ # RSS did not go below the soft limit within deadline, restart
+ break if Time.now > deadline
+
+ sleep(CHECK_INTERVAL_SECONDS)
+ end
+
+ log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
+
+ false
+ end
+
+ def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ message: 'Sidekiq worker RSS out of range',
+ current_rss: current_rss,
+ hard_limit_rss: hard_limit_rss,
+ soft_limit_rss: soft_limit_rss,
+ reason: out_of_range_description(current_rss, hard_limit_rss, soft_limit_rss)
+ )
+ end
+
+ def out_of_range_description(rss, hard_limit, soft_limit)
+ if rss > hard_limit
+ "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})"
+ else
+ "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})"
+ end
+ end
+
+ def get_rss
+ output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}), Rails.root.to_s)
+ return 0 unless status&.zero?
+
+ output.to_i
+ end
+
+ def soft_limit_rss
+ SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
+ end
+
+ def hard_limit_rss
+ HARD_LIMIT_RSS_KB
+ end
+
+ def signal_and_wait(time, signal, explanation)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ signal: signal,
+ explanation: explanation,
+ wait_time: time,
+ message: "Sending signal and waiting"
+ )
+ Process.kill(signal, pid)
+
+ deadline = Time.now + time
+
+ # we try to finish as early as all jobs finished
+ # so we retest that in loop
+ sleep(CHECK_INTERVAL_SECONDS) while enabled? && any_jobs? && Time.now < deadline
+ end
+
+ def signal_pgroup(signal, explanation)
+ if Process.getpgrp == pid
+ pid_or_pgrp_str = 'PGRP'
+ pid_to_signal = 0
+ else
+ pid_or_pgrp_str = 'PID'
+ pid_to_signal = pid
+ end
+
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
+ )
+ Process.kill(signal, pid_to_signal)
+ end
+
+ def rss_increase_by_jobs
+ Gitlab::SidekiqDaemon::Monitor.instance.jobs.sum do |job| # rubocop:disable CodeReuse/ActiveRecord
+ rss_increase_by_job(job)
+ end
+ end
+
+ def rss_increase_by_job(job)
+ memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i
+ max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', MAX_MEMORY_KB).to_i
+
+ return 0 if memory_growth_kb.zero?
+
+ time_elapsed = Time.now.to_i - job[:started_at]
+ [memory_growth_kb * time_elapsed, max_memory_growth_kb].min
+ end
+
+ def get_job_options(job, key, default)
+ job[:worker_class].sidekiq_options.fetch(key, default)
+ rescue
+ default
+ end
+
+ def pid
+ Process.pid
+ end
+
+ def any_jobs?
+ Gitlab::SidekiqDaemon::Monitor.instance.jobs.any?
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/sidekiq_daemon/monitor.rb b/lib/gitlab/sidekiq_daemon/monitor.rb
index bbfca130425..2b82adda430 100644
--- a/lib/gitlab/sidekiq_daemon/monitor.rb
+++ b/lib/gitlab/sidekiq_daemon/monitor.rb
@@ -14,19 +14,19 @@ module Gitlab
# that should not be caught by application
CancelledError = Class.new(Exception) # rubocop:disable Lint/InheritException
- attr_reader :jobs_thread
+ attr_reader :jobs
attr_reader :jobs_mutex
def initialize
super
- @jobs_thread = {}
+ @jobs = {}
@jobs_mutex = Mutex.new
end
- def within_job(jid, queue)
+ def within_job(worker_class, jid, queue)
jobs_mutex.synchronize do
- jobs_thread[jid] = Thread.current
+ jobs[jid] = { worker_class: worker_class, thread: Thread.current, started_at: Time.now.to_i }
end
if cancelled?(jid)
@@ -43,7 +43,7 @@ module Gitlab
yield
ensure
jobs_mutex.synchronize do
- jobs_thread.delete(jid)
+ jobs.delete(jid)
end
end
@@ -62,23 +62,27 @@ module Gitlab
private
def start_working
- Sidekiq.logger.info(
- class: self.class.to_s,
- action: 'start',
- message: 'Starting Monitor Daemon'
- )
+ return unless notification_channel_enabled?
- while enabled?
- process_messages
- sleep(RECONNECT_TIME)
- end
+ begin
+ Sidekiq.logger.info(
+ class: self.class.to_s,
+ action: 'start',
+ message: 'Starting Monitor Daemon'
+ )
- ensure
- Sidekiq.logger.warn(
- class: self.class.to_s,
- action: 'stop',
- message: 'Stopping Monitor Daemon'
- )
+ while enabled?
+ process_messages
+ sleep(RECONNECT_TIME)
+ end
+
+ ensure
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ action: 'stop',
+ message: 'Stopping Monitor Daemon'
+ )
+ end
end
def stop_working
@@ -156,7 +160,7 @@ module Gitlab
# This is why it passes thread in block,
# to ensure that we do process this thread
def find_thread_unsafe(jid)
- jobs_thread[jid]
+ jobs.dig(jid, :thread)
end
def find_thread_with_lock(jid)
@@ -179,6 +183,10 @@ module Gitlab
def self.cancel_job_key(jid)
"sidekiq:cancel:#{jid}"
end
+
+ def notification_channel_enabled?
+ ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
+ end
end
end
end