diff options
Diffstat (limited to 'app/models/concerns/each_batch.rb')
-rw-r--r-- | app/models/concerns/each_batch.rb | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/app/models/concerns/each_batch.rb b/app/models/concerns/each_batch.rb index 443e1ab53b4..dbc0887dc97 100644 --- a/app/models/concerns/each_batch.rb +++ b/app/models/concerns/each_batch.rb @@ -2,6 +2,7 @@ module EachBatch extend ActiveSupport::Concern + include LooseIndexScan class_methods do # Iterates over the rows in a relation in batches, similar to Rails' @@ -100,5 +101,65 @@ module EachBatch break unless stop end end + + # Iterates over the rows in a relation in batches by skipping duplicated values in the column. + # Example: counting the number of distinct authors in `issues` + # + # - Table size: 100_000 + # - Column: author_id + # - Distinct author_ids in the table: 1000 + # + # The query will read maximum 1000 rows if we have index coverage on user_id. + # + # > count = 0 + # > Issue.distinct_each_batch(column: 'author_id', of: 1000) { |r| count += r.count(:author_id) } + def distinct_each_batch(column:, order: :asc, of: 1000) + start = except(:select) + .select(column) + .reorder(column => order) + + start = start.take + + return unless start + + start_id = start[column] + arel_table = self.arel_table + arel_column = arel_table[column.to_s] + + 1.step do |index| + stop = loose_index_scan(column: column, order: order) do |cte_query, inner_query| + if order == :asc + [cte_query.where(arel_column.gteq(start_id)), inner_query] + else + [cte_query.where(arel_column.lteq(start_id)), inner_query] + end + end.offset(of).take + + if stop + stop_id = stop[column] + + relation = loose_index_scan(column: column, order: order) do |cte_query, inner_query| + if order == :asc + [cte_query.where(arel_column.gteq(start_id)), inner_query.where(arel_column.lt(stop_id))] + else + [cte_query.where(arel_column.lteq(start_id)), inner_query.where(arel_column.gt(stop_id))] + end + end + start_id = stop_id + else + relation = loose_index_scan(column: column, order: order) do |cte_query, inner_query| + if order == :asc + [cte_query.where(arel_column.gteq(start_id)), inner_query] + else + [cte_query.where(arel_column.lteq(start_id)), inner_query] + end + end + end + + unscoped { yield relation, index } + + break unless stop + end + end end end |