Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/database')
-rw-r--r--lib/gitlab/database/batch_count.rb15
-rw-r--r--lib/gitlab/database/migrations/background_migration_helpers.rb11
-rw-r--r--lib/gitlab/database/postgres_hll_batch_distinct_counter.rb156
3 files changed, 173 insertions, 9 deletions
diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb
index 6f79e965cd5..5a506da0d05 100644
--- a/lib/gitlab/database/batch_count.rb
+++ b/lib/gitlab/database/batch_count.rb
@@ -49,6 +49,8 @@ module Gitlab
MAX_ALLOWED_LOOPS = 10_000
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
ALLOWED_MODES = [:itself, :distinct].freeze
+ FALLBACK_FINISH = 0
+ OFFSET_BY_ONE = 1
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_DISTINCT_BATCH_SIZE = 10_000
@@ -65,7 +67,7 @@ module Gitlab
(@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
(@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
(finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
- start > finish
+ start >= finish
end
def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
@@ -85,11 +87,13 @@ module Gitlab
results = nil
batch_start = start
- while batch_start <= finish
- batch_relation = build_relation_batch(batch_start, batch_start + batch_size, mode)
+ while batch_start < finish
+ batch_end = [batch_start + batch_size, finish].min
+ batch_relation = build_relation_batch(batch_start, batch_end, mode)
+
begin
results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
- batch_start += batch_size
+ batch_start = batch_end
rescue ActiveRecord::QueryCanceled => error
# retry with a safe batch size & warmer cache
if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
@@ -99,6 +103,7 @@ module Gitlab
return FALLBACK
end
end
+
sleep(SLEEP_TIME_IN_SECONDS)
end
@@ -138,7 +143,7 @@ module Gitlab
end
def actual_finish(finish)
- finish || @relation.unscope(:group, :having).maximum(@column) || 0
+ (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE
end
def check_mode!(mode)
diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb
index a6cc03aa9eb..36073844765 100644
--- a/lib/gitlab/database/migrations/background_migration_helpers.rb
+++ b/lib/gitlab/database/migrations/background_migration_helpers.rb
@@ -55,7 +55,8 @@ module Gitlab
bulk_migrate_async(jobs) unless jobs.empty?
end
- # Queues background migration jobs for an entire table, batched by ID range.
+ # Queues background migration jobs for an entire table in batches.
+ # The default batching column used is the standard primary key `id`.
# Each job is scheduled with a `delay_interval` in between.
# If you use a small interval, then some jobs may run at the same time.
#
@@ -68,6 +69,7 @@ module Gitlab
# is scheduled to be run. These records can be used to trace execution of the background job, but there is no
# builtin support to manage that automatically at this time. You should only set this flag if you are aware of
# how it works, and intend to manually cleanup the database records in your background job.
+ # primary_column_name - The name of the primary key column if the primary key is not `id`
#
# *Returns the final migration delay*
#
@@ -87,8 +89,9 @@ module Gitlab
# # do something
# end
# end
- def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false)
- raise "#{model_class} does not have an ID to use for batch ranges" unless model_class.column_names.include?('id')
+ def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false, primary_column_name: :id)
+ raise "#{model_class} does not have an ID column of #{primary_column_name} to use for batch ranges" unless model_class.column_names.include?(primary_column_name.to_s)
+ raise "#{primary_column_name} is not an integer column" unless model_class.columns_hash[primary_column_name.to_s].type == :integer
# To not overload the worker too much we enforce a minimum interval both
# when scheduling and performing jobs.
@@ -99,7 +102,7 @@ module Gitlab
final_delay = 0
model_class.each_batch(of: batch_size) do |relation, index|
- start_id, end_id = relation.pluck(Arel.sql('MIN(id), MAX(id)')).first
+ start_id, end_id = relation.pluck(Arel.sql("MIN(#{primary_column_name}), MAX(#{primary_column_name})")).first
# `BackgroundMigrationWorker.bulk_perform_in` schedules all jobs for
# the same time, which is not helpful in most cases where we wish to
diff --git a/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
new file mode 100644
index 00000000000..4b2afcb128f
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
@@ -0,0 +1,156 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
+ # Implements a distinct batch counter based on HyperLogLog algorithm
+ # Needs indexes on the column below to calculate max, min and range queries
+ # For larger tables just set higher batch_size with index optimization
+ #
+ # In order to not use a possible complex time consuming query when calculating min and max values,
+ # the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
+ # relation (most cases `id` column) rather than counted attribute eg:
+ # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
+ #
+ # Grouped relations are NOT supported yet.
+ #
+ # @example Usage
+ # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+ # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
+ # .estimate_distinct_count(
+ # batch_size: 1_000,
+ # start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
+ # finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
+ # )
+ #
+ # @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
+ # Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
+ # for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
+ # for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
+ class PostgresHllBatchDistinctCounter
+ FALLBACK = -1
+ MIN_REQUIRED_BATCH_SIZE = 1_250
+ MAX_ALLOWED_LOOPS = 10_000
+ SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
+
+ # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
+ DEFAULT_BATCH_SIZE = 100_000
+
+ BIT_31_MASK = "B'0#{'1' * 31}'"
+ BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+ # @example source_query
+ # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+ # FROM %{relation}
+ # WHERE %{pkey} >= %{batch_start}
+ # AND %{pkey} < %{batch_end}
+ # AND %{column} IS NOT NULL
+ BUCKETED_DATA_SQL = <<~SQL
+ WITH hashed_attributes AS (%{source_query})
+ SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+ (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
+ FROM hashed_attributes
+ GROUP BY 1 ORDER BY 1
+ SQL
+
+ TOTAL_BUCKETS_NUMBER = 512
+
+ def initialize(relation, column = nil)
+ @relation = relation
+ @column = column || relation.primary_key
+ end
+
+ def unwanted_configuration?(finish, batch_size, start)
+ batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+ (finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
+ start > finish
+ end
+
+ def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+ raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
+
+ batch_size ||= DEFAULT_BATCH_SIZE
+
+ start = actual_start(start)
+ finish = actual_finish(finish)
+
+ raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
+ return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+
+ batch_start = start
+ hll_blob = {}
+
+ while batch_start <= finish
+ begin
+ hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+ batch_start += batch_size
+ end
+ sleep(SLEEP_TIME_IN_SECONDS)
+ end
+
+ estimate_cardinality(hll_blob)
+ end
+
+ private
+
+ # arbitrary values that are present in #estimate_cardinality
+ # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+ # article, they are not representing any entity and serves as tune value
+ # for the whole equation
+ def estimate_cardinality(hll_blob)
+ num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
+
+ num_uniques = (
+ ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
+ (num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} )
+ ).to_i
+
+ if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
+ ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
+ Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
+ else
+ num_uniques
+ end
+ end
+
+ def hll_blob_for_batch(start, finish)
+ @relation
+ .connection
+ .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
+ .map(&:values)
+ .to_h
+ end
+
+ # Generate the source query SQL snippet for the provided id range
+ #
+ # @example SQL query template
+ # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+ # FROM %{relation}
+ # WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
+ # AND %{column} IS NOT NULL
+ #
+ # @param start initial id range
+ # @param finish final id range
+ # @return [String] SQL query fragment
+ def source_query(start, finish)
+ col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
+ col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
+ md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
+ md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
+ bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
+
+ @relation
+ .where(@relation.primary_key => (start...finish))
+ .where(col_as_arel.not_eq(nil))
+ .select(bits.as('attr_hash_32_bits')).to_sql
+ end
+
+ def actual_start(start)
+ start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
+ end
+
+ def actual_finish(finish)
+ finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
+ end
+ end
+ end
+end