diff options
Diffstat (limited to 'lib/gitlab/database/batch_count.rb')
-rw-r--r-- | lib/gitlab/database/batch_count.rb | 156 |
1 files changed, 1 insertions, 155 deletions
diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb index 9002d39e1ee..49f56b5be97 100644 --- a/lib/gitlab/database/batch_count.rb +++ b/lib/gitlab/database/batch_count.rb @@ -18,7 +18,7 @@ # batch_count(::Clusters::Cluster.aws_installed.enabled, :cluster_id) # batch_count(Namespace.group(:type)) # batch_distinct_count(::Project, :creator_id) -# batch_distinct_count(::Project.with_active_services.service_desk_enabled.where(time_period), start: ::User.minimum(:id), finish: ::User.maximum(:id)) +# batch_distinct_count(::Project.aimed_for_deletion.service_desk_enabled.where(time_period), start: ::User.minimum(:id), finish: ::User.maximum(:id)) # batch_distinct_count(Project.group(:visibility_level), :creator_id) # batch_sum(User, :sign_in_count) # batch_sum(Issue.group(:state_id), :weight)) @@ -41,159 +41,5 @@ module Gitlab include BatchCount end end - - class BatchCounter - FALLBACK = -1 - MIN_REQUIRED_BATCH_SIZE = 1_250 - DEFAULT_SUM_BATCH_SIZE = 1_000 - MAX_ALLOWED_LOOPS = 10_000 - SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep - ALLOWED_MODES = [:itself, :distinct].freeze - FALLBACK_FINISH = 0 - OFFSET_BY_ONE = 1 - - # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705 - DEFAULT_DISTINCT_BATCH_SIZE = 10_000 - DEFAULT_BATCH_SIZE = 100_000 - - def initialize(relation, column: nil, operation: :count, operation_args: nil) - @relation = relation - @column = column || relation.primary_key - @operation = operation - @operation_args = operation_args - end - - def unwanted_configuration?(finish, batch_size, start) - (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) || - (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) || - (finish - start) / batch_size >= MAX_ALLOWED_LOOPS || - start >= finish - end - - def count(batch_size: nil, mode: :itself, start: nil, finish: nil) - raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open? - - check_mode!(mode) - - # non-distinct have better performance - batch_size ||= batch_size_for_mode_and_operation(mode, @operation) - - start = actual_start(start) - finish = actual_finish(finish) - - raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0 - return FALLBACK if unwanted_configuration?(finish, batch_size, start) - - results = nil - batch_start = start - - while batch_start < finish - begin - batch_end = [batch_start + batch_size, finish].min - batch_relation = build_relation_batch(batch_start, batch_end, mode) - - op_args = @operation_args - if @operation == :count && @operation_args.blank? && use_loose_index_scan_for_distinct_values?(mode) - op_args = [Gitlab::Database::LooseIndexScanDistinctCount::COLUMN_ALIAS] - end - - results = merge_results(results, batch_relation.send(@operation, *op_args)) # rubocop:disable GitlabSecurity/PublicSend - batch_start = batch_end - rescue ActiveRecord::QueryCanceled => error - # retry with a safe batch size & warmer cache - if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE - batch_size /= 2 - else - log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error) - return FALLBACK - end - rescue Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError => error - Gitlab::AppJsonLogger - .error( - event: 'batch_count', - relation: @relation.table_name, - operation: @operation, - operation_args: @operation_args, - mode: mode, - message: "LooseIndexScanDistinctCount column error: #{error.message}" - ) - - return FALLBACK - end - - sleep(SLEEP_TIME_IN_SECONDS) - end - - results - end - - def merge_results(results, object) - return object unless results - - if object.is_a?(Hash) - results.merge!(object) { |_, a, b| a + b } - else - results + object - end - end - - private - - def build_relation_batch(start, finish, mode) - if use_loose_index_scan_for_distinct_values?(mode) - Gitlab::Database::LooseIndexScanDistinctCount.new(@relation, @column).build_query(from: start, to: finish) - else - @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend - end - end - - def batch_size_for_mode_and_operation(mode, operation) - return DEFAULT_SUM_BATCH_SIZE if operation == :sum - - mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE - end - - def between_condition(start, finish) - return @column.between(start...finish) if @column.is_a?(Arel::Attributes::Attribute) - - { @column => start...finish } - end - - def actual_start(start) - start || @relation.unscope(:group, :having).minimum(@column) || 0 - end - - def actual_finish(finish) - (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE - end - - def check_mode!(mode) - raise "The mode #{mode.inspect} is not supported" unless ALLOWED_MODES.include?(mode) - raise 'Use distinct count for optimized distinct counting' if @relation.limit(1).distinct_value.present? && mode != :distinct - raise 'Use distinct count only with non id fields' if @column == :id && mode == :distinct - end - - def log_canceled_batch_fetch(batch_start, mode, query, error) - Gitlab::AppJsonLogger - .error( - event: 'batch_count', - relation: @relation.table_name, - operation: @operation, - operation_args: @operation_args, - start: batch_start, - mode: mode, - query: query, - message: "Query has been canceled with message: #{error.message}" - ) - end - - def use_loose_index_scan_for_distinct_values?(mode) - Feature.enabled?(:loose_index_scan_for_distinct_values) && not_group_by_query? && mode == :distinct - end - - def not_group_by_query? - !@relation.is_a?(ActiveRecord::Relation) || @relation.group_values.blank? - end - end end end |