1 files changed, 192 insertions, 0 deletions
diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb
new file mode 100644
index 00000000000..db75ba8a47d
--- /dev/null
+++ b/lib/gitlab/memory/watchdog.rb
@@ -0,0 +1,192 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Memory
+    # A background thread that observes Ruby heap fragmentation and calls
+    # into a handler when the Ruby heap has been fragmented for an extended
+    # period of time.
+    #
+    # See Gitlab::Metrics::Memory for how heap fragmentation is defined.
+    #
+    # To decide whether a given fragmentation level is being exceeded,
+    # the watchdog regularly polls the GC. Whenever a violation occurs
+    # a strike is issued. Once the number of consecutive strikes exceeds
+    # `max_strikes`, a handler is invoked to deal with the situation.
+    #
+    # The duration for which a process may be above a given fragmentation
+    # threshold is computed as `max_strikes * sleep_time_seconds`.
+    class Watchdog < Daemon
+      DEFAULT_SLEEP_TIME_SECONDS = 60 # used when GITLAB_MEMWD_SLEEP_TIME_SEC is unset
+      DEFAULT_HEAP_FRAG_THRESHOLD = 0.5 # used when GITLAB_MEMWD_MAX_HEAP_FRAG is unset
+      DEFAULT_MAX_STRIKES = 5 # used when GITLAB_MEMWD_MAX_STRIKES is unset
+
+      # A no-op handler. It always answers `false`, signalling to the caller
+      # that nothing was done about the fragmentation, so the caller will keep
+      # invoking it for as long as fragmentation stays high.
+      #
+      # Handy for production "dress rehearsals": it lets us see how often the
+      # handler would fire before we wire up one that actually takes action.
+      class NullHandler
+        include Singleton
+
+        # Ignores the reported fragmentation `value`.
+        def on_high_heap_fragmentation(value)
+          false # i.e. "not handled"
+        end
+      end
+
+      # Deals with high fragmentation by sending SIGTERM to the given process (this one by default).
+      class TermProcessHandler
+        def initialize(pid = $$)
+          @target_pid = pid
+        end
+
+        def on_high_heap_fragmentation(value)
+          Process.kill(:TERM, @target_pid)
+          true # the situation counts as handled once the signal is sent
+        end
+      end
+
+      # Delegates to Puma's own graceful worker termination, which honours a
+      # configurable grace period during which the process is allowed to stay
+      # unresponsive to SIGTERM.
+      class PumaHandler
+        def initialize(puma_options = ::Puma.cli_config.options)
+          @worker_handle = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
+        end
+
+        def on_high_heap_fragmentation(value)
+          @worker_handle.term
+          true
+        end
+      end
+
+      # handler:
+      #   Receives on_high_heap_fragmentation(value) once strikes exceed max_strikes.
+      # max_heap_fragmentation:
+      #   Highest tolerated degree of Ruby heap fragmentation, in the range [0,1].
+      # max_strikes:
+      #   How many strikes (readings above max_heap_fragmentation) are tolerated before a handler is invoked.
+      # sleep_time_seconds:
+      #   Pause between two wake-ups of the watchdog, i.e. how often the GC is polled.
+      def initialize(
+        handler: NullHandler.instance,
+        logger: Logger.new($stdout),
+        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_HEAP_FRAG_THRESHOLD,
+        max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
+        sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
+        **options)
+        super(**options)
+
+        @logger = logger
+        @handler = handler
+        @max_strikes = max_strikes
+        @sleep_time_seconds = sleep_time_seconds
+        @max_heap_fragmentation = max_heap_fragmentation
+
+        @strikes = 0
+        @alive = true
+        init_prometheus_metrics(max_heap_fragmentation)
+      end
+
+      attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds # readers only
+
+      # Main loop of the watchdog: sleep, then poll, for as long as it is alive.
+      def run_thread
+        @logger.info(log_labels.merge(message: 'started'))
+        while @alive
+          sleep(sleep_time_seconds)
+          next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+
+          monitor_heap_fragmentation
+        end
+        @logger.info(log_labels.merge(message: 'stopped'))
+      end
+
+      private
+
+      # Takes one heap fragmentation reading, updates the strike count and hands
+      # over to the handler once that count goes above max_strikes.
+      def monitor_heap_fragmentation
+        current = Gitlab::Metrics::Memory.gc_heap_fragmentation
+        over_limit = current > @max_heap_fragmentation
+
+        # A violation adds a strike; any other reading wipes the slate clean.
+        @strikes = over_limit ? @strikes + 1 : 0
+        @heap_frag_violations.increment if over_limit
+        return unless @strikes > @max_strikes
+
+        # A truthy handler result means the event was dealt with, so the watchdog
+        # may stop; otherwise it keeps running. Either way strikes start over.
+        @alive = !handle_heap_fragmentation_limit_exceeded(current)
+        @strikes = 0
+      end
+
+      # Logs a warning carrying the current fragmentation `value`, bumps the
+      # "violations handled" counter and passes `value` on to the active handler.
+      # Returns whatever that handler returns.
+      def handle_heap_fragmentation_limit_exceeded(value)
+        details = { message: 'heap fragmentation limit exceeded', memwd_cur_heap_frag: value }
+        @logger.warn(log_labels.merge(details))
+        @heap_frag_violations_handled.increment
+
+        handler.on_high_heap_fragmentation(value)
+      end
+
+      # With the enforcement flag off the watchdog keeps running in a "friendly mode":
+      # the NullHandler stands in for the configured handler, so all that happens is
+      # that we collect logs and Prometheus events for fragmentation violations.
+      def handler
+        enforcing = Feature.enabled?(:enforce_memory_watchdog, type: :ops)
+        enforcing ? @handler : NullHandler.instance
+      end
+
+      def stop_working
+        @alive = false # run_thread's loop checks this flag and exits once it is false
+      end
+
+      # Fields shared by every watchdog log entry; computed afresh on each call.
+      def log_labels
+        {
+          pid: Process.pid, worker_id: worker_id,
+          memwd_handler_class: handler.class.name,
+          memwd_sleep_time_s: sleep_time_seconds,
+          memwd_max_heap_frag: max_heap_fragmentation,
+          memwd_max_strikes: max_strikes,
+          memwd_cur_strikes: strikes,
+          memwd_rss_bytes: process_rss_bytes
+        }
+      end
+
+      def worker_id
+        ::Prometheus::PidProvider.worker_id # logged as worker_id and used as the `pid` metric label
+      end
+
+      def process_rss_bytes
+        Gitlab::Metrics::System.memory_usage_rss # logged as memwd_rss_bytes
+      end
+
+      # Registers the watchdog's gauge and counters, all labelled with this worker's
+      # id, and publishes the configured fragmentation limit right away.
+      def init_prometheus_metrics(max_heap_fragmentation)
+        base_labels = { pid: worker_id }
+
+        @heap_frag_limit = Gitlab::Metrics.gauge(
+          :gitlab_memwd_heap_frag_limit,
+          'The configured limit for how fragmented the Ruby heap is allowed to be',
+          base_labels
+        ).tap { |gauge| gauge.set({}, max_heap_fragmentation) }
+        @heap_frag_violations = Gitlab::Metrics.counter(
+          :gitlab_memwd_heap_frag_violations_total,
+          'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum',
+          base_labels
+        )
+        @heap_frag_violations_handled = Gitlab::Metrics.counter(
+          :gitlab_memwd_heap_frag_violations_handled_total,
+          'Total number of times heap fragmentation violations in a Ruby process were handled',
+          base_labels
+        )
+      end
+    end
+  end
+end