Welcome to mirror list, hosted at ThFree Co, Russian Federation.

batch_counter.rb « database « gitlab « lib - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 7abcef5c93bb3733771389b7a2458ba079f07330 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# frozen_string_literal: true

module Gitlab
  module Database
    class BatchCounter
      FALLBACK = -1
      MIN_REQUIRED_BATCH_SIZE = 1_250
      DEFAULT_SUM_BATCH_SIZE = 1_000
      MAX_ALLOWED_LOOPS = 10_000
      SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
      ALLOWED_MODES = [:itself, :distinct].freeze
      FALLBACK_FINISH = 0
      OFFSET_BY_ONE = 1

      # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
      DEFAULT_DISTINCT_BATCH_SIZE = 10_000
      DEFAULT_BATCH_SIZE = 100_000

      def initialize(relation, column: nil, operation: :count, operation_args: nil, max_allowed_loops: nil)
        @relation = relation
        @column = column || relation.primary_key
        @operation = operation
        @operation_args = operation_args
        @max_allowed_loops = max_allowed_loops || MAX_ALLOWED_LOOPS
      end

      def unwanted_configuration?(finish, batch_size, start)
        (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
          (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
          (finish - start) / batch_size >= @max_allowed_loops ||
          start >= finish
      end

      def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
        result = count_with_timeout(batch_size: batch_size, mode: mode, start: start, finish: finish, timeout: nil)

        return FALLBACK if result[:status] != :completed

        result[:count]
      end

      def count_with_timeout(batch_size: nil, mode: :itself, start: nil, finish: nil, timeout: nil, partial_results: nil)
        raise 'BatchCount can not be run inside a transaction' if transaction_open?

        check_mode!(mode)

        # non-distinct have better performance
        batch_size ||= batch_size_for_mode_and_operation(mode, @operation)

        start = actual_start(start)
        finish = actual_finish(finish)

        raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
        return { status: :bad_config } if unwanted_configuration?(finish, batch_size, start)

        results = partial_results
        batch_start = start

        start_time = ::Gitlab::Metrics::System.monotonic_time.seconds

        while batch_start < finish

          # Timeout elapsed, return partial result so the caller can continue later
          if timeout && ::Gitlab::Metrics::System.monotonic_time.seconds - start_time > timeout
            return { status: :timeout, partial_results: results, continue_from: batch_start }
          end

          begin
            batch_end = [batch_start + batch_size, finish].min
            batch_relation = build_relation_batch(batch_start, batch_end, mode)

            results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
            batch_start = batch_end
          rescue ActiveRecord::QueryCanceled => error
            # retry with a safe batch size & warmer cache
            if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
              batch_size /= 2
            else
              log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error)
              return { status: :cancelled }
            end
          end

          sleep(SLEEP_TIME_IN_SECONDS)
        end

        { status: :completed, count: results }
      end

      def transaction_open?
        @relation.connection.transaction_open?
      end

      def merge_results(results, object)
        return object unless results

        if object.is_a?(Hash)
          results.merge!(object) { |_, a, b| a + b }
        else
          results + object
        end
      end

      private

      def build_relation_batch(start, finish, mode)
        @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
      end

      def batch_size_for_mode_and_operation(mode, operation)
        return DEFAULT_SUM_BATCH_SIZE if operation == :sum

        mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE
      end

      def between_condition(start, finish)
        return @column.between(start...finish) if @column.is_a?(Arel::Attributes::Attribute)

        { @column => start...finish }
      end

      def actual_start(start)
        start || @relation.unscope(:group, :having).minimum(@column) || 0
      end

      def actual_finish(finish)
        (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE
      end

      def check_mode!(mode)
        raise "The mode #{mode.inspect} is not supported" unless ALLOWED_MODES.include?(mode)
        raise 'Use distinct count for optimized distinct counting' if @relation.limit(1).distinct_value.present? && mode != :distinct
        raise 'Use distinct count only with non id fields' if @column == :id && mode == :distinct
      end

      def log_canceled_batch_fetch(batch_start, mode, query, error)
        Gitlab::AppJsonLogger
          .error(
            event: 'batch_count',
            relation: @relation.table_name,
            operation: @operation,
            operation_args: @operation_args,
            max_allowed_loops: @max_allowed_loops,
            start: batch_start,
            mode: mode,
            query: query,
            message: "Query has been canceled with message: #{error.message}"
          )
      end
    end
  end
end