gitlab.com/gitlab-org/gitlab-foss.git
author    Robert Speicher <rspeicher@gmail.com>  2021-01-20 22:34:23 +0300
committer Robert Speicher <rspeicher@gmail.com>  2021-01-20 22:34:23 +0300
commit    6438df3a1e0fb944485cebf07976160184697d72 (patch)
tree      00b09bfd170e77ae9391b1a2f5a93ef6839f2597 /lib/gitlab/database
parent    42bcd54d971da7ef2854b896a7b34f4ef8601067 (diff)

Add latest changes from gitlab-org/gitlab@13-8-stable-ee (tag: v13.8.0-rc42)

Diffstat (limited to 'lib/gitlab/database')
 lib/gitlab/database/median.rb                                       | 149
 lib/gitlab/database/migration_helpers.rb                            | 175
 lib/gitlab/database/migrations/background_migration_helpers.rb      |  10
 lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb |   2
 lib/gitlab/database/postgres_hll/batch_distinct_counter.rb          |  62
 lib/gitlab/database/postgres_hll/buckets.rb                         |  77
 lib/gitlab/database/reindexing.rb                                   |   6
 lib/gitlab/database/reindexing/coordinator.rb                       |  38
 lib/gitlab/database/reindexing/grafana_notifier.rb                  |  72
 lib/gitlab/database/reindexing/reindex_action.rb                    |  20
 10 files changed, 395 insertions(+), 216 deletions(-)
diff --git a/lib/gitlab/database/median.rb b/lib/gitlab/database/median.rb
deleted file mode 100644
index 603b125d8b4..00000000000
--- a/lib/gitlab/database/median.rb
+++ /dev/null
@@ -1,149 +0,0 @@
-# frozen_string_literal: true
-
-# https://www.periscopedata.com/blog/medians-in-sql.html
-module Gitlab
- module Database
- module Median
- NotSupportedError = Class.new(StandardError)
-
- def median_datetime(arel_table, query_so_far, column_sym)
- extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence
- end
-
- def median_datetimes(arel_table, query_so_far, column_sym, partition_column)
- extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence
- end
-
- def extract_median(results)
- result = results.compact.first
-
- result = result.first.presence
-
- result['median']&.to_f if result
- end
-
- def extract_medians(results)
- median_values = results.compact.first.values
-
- median_values.each_with_object({}) do |(id, median), hash|
- hash[id.to_i] = median&.to_f
- end
- end
-
- def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil)
- # Create a CTE with the column we're operating on, row number (after sorting by the column
- # we're operating on), and count of the table we're operating on (duplicated across) all rows
- # of the CTE. For example, if we're looking to find the median of the `projects.star_count`
- # column, the CTE might look like this:
- #
- # star_count | row_id | ct
- # ------------+--------+----
- # 5 | 1 | 3
- # 9 | 2 | 3
- # 15 | 3 | 3
- #
- # If a partition column is used we will do the same operation but for separate partitions,
- # when that happens the CTE might look like this:
- #
- # project_id | star_count | row_id | ct
- # ------------+------------+--------+----
- # 1 | 5 | 1 | 2
- # 1 | 9 | 2 | 2
- # 2 | 10 | 1 | 3
- # 2 | 15 | 2 | 3
- # 2 | 20 | 3 | 3
- cte_table = Arel::Table.new("ordered_records")
-
- cte = Arel::Nodes::As.new(
- cte_table,
- arel_table.project(*rank_rows(arel_table, column_sym, partition_column)).
- # Disallow negative values
- where(arel_table[column_sym].gteq(zero_interval)))
-
- # From the CTE, select either the middle row or the middle two rows (this is accomplished
- # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
- # selected rows, and this is the median value.
- result =
- cte_table
- .project(*median_projections(cte_table, column_sym, partition_column))
- .where(
- Arel::Nodes::Between.new(
- cte_table[:row_id],
- Arel::Nodes::And.new(
- [(cte_table[:ct] / Arel.sql('2.0')),
- (cte_table[:ct] / Arel.sql('2.0') + 1)]
- )
- )
- )
- .with(query_so_far, cte)
-
- result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column
-
- result.to_sql
- end
-
- private
-
- def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil)
- queries = pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
-
- Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) }
- end
-
- def average(args, as)
- Arel::Nodes::NamedFunction.new("AVG", args, as)
- end
-
- def rank_rows(arel_table, column_sym, partition_column)
- column_row = arel_table[column_sym].as(column_sym.to_s)
-
- if partition_column
- partition_row = arel_table[partition_column]
- row_id =
- Arel::Nodes::Over.new(
- Arel::Nodes::NamedFunction.new('rank', []),
- Arel::Nodes::Window.new.partition(arel_table[partition_column])
- .order(arel_table[column_sym])
- ).as('row_id')
-
- count = arel_table.from.from(arel_table.alias)
- .project('COUNT(*)')
- .where(arel_table[partition_column].eq(arel_table.alias[partition_column]))
- .as('ct')
-
- [partition_row, column_row, row_id, count]
- else
- row_id =
- Arel::Nodes::Over.new(
- Arel::Nodes::NamedFunction.new('row_number', []),
- Arel::Nodes::Window.new.order(arel_table[column_sym])
- ).as('row_id')
-
- count = arel_table.where(arel_table[column_sym].gteq(zero_interval)).project("COUNT(1)").as('ct')
-
- [column_row, row_id, count]
- end
- end
-
- def median_projections(table, column_sym, partition_column)
- projections = []
- projections << table[partition_column] if partition_column
- projections << average([extract_epoch(table[column_sym])], "median")
- projections
- end
-
- def extract_epoch(arel_attribute)
- Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
- end
-
- def extract_diff_epoch(diff)
- Arel.sql(%Q{EXTRACT(EPOCH FROM (#{diff.to_sql}))})
- end
-
- # Need to cast '0' to an INTERVAL before we can check if the interval is positive
- def zero_interval
- Arel::Nodes::NamedFunction.new("CAST", [Arel.sql("'0' AS INTERVAL")])
- end
- end
- end
-end
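
The deleted module built median queries by hand with a CTE and window functions. For readers wondering what a simpler approach looks like, PostgreSQL ships a built-in ordered-set aggregate that yields the same value; a minimal sketch with illustrative table and column names (not part of this commit, and not necessarily what GitLab replaced this module with):

    # Median of projects.star_count via PostgreSQL's percentile_cont aggregate.
    # Requires an established ActiveRecord connection; table/column are examples only.
    median = ActiveRecord::Base.connection.select_value(<<~SQL)
      SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY star_count) AS median
      FROM projects
    SQL

    median&.to_f
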
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 164fce5a5a3..6b169a504f3 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -70,6 +70,61 @@ module Gitlab
end
end
+ #
+ # Creates a new table, optionally allowing the caller to add check constraints to the table.
+ # Aside from that addition, this method should behave identically to Rails' `create_table` method.
+ #
+ # Example:
+ #
+ # create_table_with_constraints :some_table do |t|
+ # t.integer :thing, null: false
+ # t.text :other_thing
+ #
+ # t.check_constraint :thing_is_not_null, 'thing IS NOT NULL'
+ # t.text_limit :other_thing, 255
+ # end
+ #
+ # See Rails' `create_table` for more info on the available arguments.
+ def create_table_with_constraints(table_name, **options, &block)
+ helper_context = self
+ check_constraints = []
+
+ with_lock_retries do
+ create_table(table_name, **options) do |t|
+ t.define_singleton_method(:check_constraint) do |name, definition|
+ helper_context.send(:validate_check_constraint_name!, name) # rubocop:disable GitlabSecurity/PublicSend
+
+ check_constraints << { name: name, definition: definition }
+ end
+
+ t.define_singleton_method(:text_limit) do |column_name, limit, name: nil|
+ # rubocop:disable GitlabSecurity/PublicSend
+ name = helper_context.send(:text_limit_name, table_name, column_name, name: name)
+ helper_context.send(:validate_check_constraint_name!, name)
+ # rubocop:enable GitlabSecurity/PublicSend
+
+ column_name = helper_context.quote_column_name(column_name)
+ definition = "char_length(#{column_name}) <= #{limit}"
+
+ check_constraints << { name: name, definition: definition }
+ end
+
+ t.instance_eval(&block) unless block.nil?
+ end
+
+ next if check_constraints.empty?
+
+ constraint_clauses = check_constraints.map do |constraint|
+ "ADD CONSTRAINT #{quote_table_name(constraint[:name])} CHECK (#{constraint[:definition]})"
+ end
+
+ execute(<<~SQL)
+ ALTER TABLE #{quote_table_name(table_name)}
+ #{constraint_clauses.join(",\n")}
+ SQL
+ end
+ end
+
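
For the usage shown in the comment above, the constraint clauses collected by the singleton methods end up in a single ALTER TABLE, roughly as sketched below. The text-limit constraint name is generated by `text_limit_name`, so the name shown here is only a placeholder:

    # Roughly the statement the helper executes for the documented example;
    # "check_2fb0c2b0a1" is a placeholder for the generated text-limit constraint name.
    ActiveRecord::Base.connection.execute(<<~SQL)
      ALTER TABLE "some_table"
      ADD CONSTRAINT "thing_is_not_null" CHECK (thing IS NOT NULL),
      ADD CONSTRAINT "check_2fb0c2b0a1" CHECK (char_length("other_thing") <= 255)
    SQL
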
# Creates a new index, concurrently
#
# Example:
@@ -858,6 +913,120 @@ module Gitlab
end
end
+ # Initializes the conversion of an integer column to bigint
+ #
+ # It can be used for converting both a Primary Key and any Foreign Keys
+ # that may reference it or any other integer column that we may want to
+ # upgrade (e.g. columns that store IDs, but are not set as FKs).
+ #
+ # - For primary keys and Foreign Keys (or other columns) defined as NOT NULL,
+ # the new bigint column is added with a hardcoded NOT NULL DEFAULT 0
+ # which allows us to skip a very costly verification step once we
+ # are ready to switch it.
+ # This is crucial for Primary Key conversions, because setting a column
+ # as the PK converts even check constraints to NOT NULL constraints
+ # and forces an inline re-verification of the whole table.
+ # - It backfills the new column with the values of the existing primary key
+ # by scheduling background jobs.
+ # - It tracks the scheduled background jobs through the use of
+ # Gitlab::Database::BackgroundMigrationJob
+ # which allows a more thorough check that all jobs succeeded in the
+ # cleanup migration and is way faster for very large tables.
+ # - It sets up a trigger to keep the two columns in sync
+ # - It does not schedule a cleanup job: we have to do that with follow-up
+ # post-deployment migrations in the next release.
+ #
+ # This needs to be done manually by using the
+ # `cleanup_initialize_conversion_of_integer_to_bigint`
+ # (not yet implemented - check #288005)
+ #
+ # table - The name of the database table containing the column
+ # column - The name of the column that we want to convert to bigint.
+ # primary_key - The name of the primary key column (most often :id)
+ # batch_size - The number of rows to schedule in a single background migration
+ # sub_batch_size - The smaller batches that will be used by each scheduled job
+ # to update the table. Useful to keep each update at ~100ms while executing
+ # more updates per interval (2.minutes)
+ # Note that each execution of a sub-batch adds a constant 100ms sleep
+ # time in between the updates, which must be taken into account
+ # while calculating the batch, sub_batch and interval values.
+ # interval - The time interval between every background migration
+ #
+ # example:
+ # Assume that we have figured out that updating 200 records of the events
+ # table takes ~100ms on average.
+ # We can set the sub_batch_size to 200, leave the interval to the default
+ # and set the batch_size to 50_000 which will require
+ # ~50s = (50000 / 200) * (0.1 + 0.1) to complete and leaves breathing space
+ # between the scheduled jobs
+ def initialize_conversion_of_integer_to_bigint(
+ table,
+ column,
+ primary_key: :id,
+ batch_size: 20_000,
+ sub_batch_size: 1000,
+ interval: 2.minutes
+ )
+
+ if transaction_open?
+ raise 'initialize_conversion_of_integer_to_bigint can not be run inside a transaction'
+ end
+
+ unless table_exists?(table)
+ raise "Table #{table} does not exist"
+ end
+
+ unless column_exists?(table, primary_key)
+ raise "Column #{primary_key} does not exist on #{table}"
+ end
+
+ unless column_exists?(table, column)
+ raise "Column #{column} does not exist on #{table}"
+ end
+
+ check_trigger_permissions!(table)
+
+ old_column = column_for(table, column)
+ tmp_column = "#{column}_convert_to_bigint"
+
+ with_lock_retries do
+ if (column.to_s == primary_key.to_s) || !old_column.null
+ # If the column to be converted is either a PK or is defined as NOT NULL,
+ # set it to `NOT NULL DEFAULT 0` and copy the correct values over below.
+ # That way, we skip the expensive validation step required to add
+ # a NOT NULL constraint at the end of the process
+ add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false)
+ else
+ add_column(table, tmp_column, :bigint, default: old_column.default)
+ end
+
+ install_rename_triggers(table, column, tmp_column)
+ end
+
+ source_model = Class.new(ActiveRecord::Base) do
+ include EachBatch
+
+ self.table_name = table
+ self.inheritance_column = :_type_disabled
+ end
+
+ queue_background_migration_jobs_by_range_at_intervals(
+ source_model,
+ 'CopyColumnUsingBackgroundMigrationJob',
+ interval,
+ batch_size: batch_size,
+ other_job_arguments: [table, primary_key, column, tmp_column, sub_batch_size],
+ track_jobs: true,
+ primary_column_name: primary_key
+ )
+
+ if perform_background_migration_inline?
+ # To ensure the schema is up to date immediately we perform the
+ # migration inline in dev / test environments.
+ Gitlab::BackgroundMigration.steal('CopyColumnUsingBackgroundMigrationJob')
+ end
+ end
+
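
A sketch of how a post-deployment migration might call the new helper, following the worked example in the comment (the migration class, table, and tuning values are illustrative and not taken from this commit):

    # Hypothetical migration; the events table and the 50_000/200 batch sizes
    # mirror the example from the comment above.
    class InitializeConversionOfEventsIdToBigint < ActiveRecord::Migration[6.0]
      include Gitlab::Database::MigrationHelpers

      DOWNTIME = false

      disable_ddl_transaction! # the helper refuses to run inside a transaction

      def up
        initialize_conversion_of_integer_to_bigint :events, :id,
          batch_size: 50_000, sub_batch_size: 200
      end

      def down
        # Rollback would need to drop the sync trigger before removing the
        # id_convert_to_bigint column; the exact cleanup is omitted here
        # (see the note about #288005 above).
        raise ActiveRecord::IrreversibleMigration
      end
    end
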
# Performs a concurrent column rename when using PostgreSQL.
def install_rename_triggers_for_postgresql(trigger, table, old, new)
execute <<-EOF.strip_heredoc
@@ -996,9 +1165,9 @@ module Gitlab
Arel::Nodes::SqlLiteral.new(replace.to_sql)
end
- def remove_foreign_key_if_exists(*args)
- if foreign_key_exists?(*args)
- remove_foreign_key(*args)
+ def remove_foreign_key_if_exists(...)
+ if foreign_key_exists?(...)
+ remove_foreign_key(...)
end
end
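
The change above swaps `*args` splatting for Ruby's `...` argument forwarding (available since Ruby 2.7), which passes positional, keyword, and block arguments through unchanged. A standalone illustration (not GitLab code):

    def target(*args, **opts)
      [args, opts]
    end

    # `...` forwards every positional, keyword and block argument to the callee.
    def forwarder(...)
      target(...)
    end

    forwarder(:source_table, :target_table, column: :user_id)
    # => [[:source_table, :target_table], { column: :user_id }]
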
diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb
index 36073844765..12dcf68da2f 100644
--- a/lib/gitlab/database/migrations/background_migration_helpers.rb
+++ b/lib/gitlab/database/migrations/background_migration_helpers.rb
@@ -100,6 +100,7 @@ module Gitlab
end
final_delay = 0
+ batch_counter = 0
model_class.each_batch(of: batch_size) do |relation, index|
start_id, end_id = relation.pluck(Arel.sql("MIN(#{primary_column_name}), MAX(#{primary_column_name})")).first
@@ -112,8 +113,17 @@ module Gitlab
track_in_database(job_class_name, full_job_arguments) if track_jobs
migrate_in(final_delay, job_class_name, full_job_arguments)
+
+ batch_counter += 1
end
+ duration = initial_delay + delay_interval * batch_counter
+ say <<~SAY
+ Scheduled #{batch_counter} #{job_class_name} jobs with a maximum of #{batch_size} records per batch and an interval of #{delay_interval} seconds.
+
+ The migration is expected to take at least #{duration} seconds. Expect all jobs to have completed after #{Time.zone.now + duration}.
+ SAY
+
final_delay
end
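
The new message estimates total runtime as `initial_delay + delay_interval * batch_counter`; a worked example under assumed values (not from this commit):

    require 'active_support'
    require 'active_support/core_ext/integer/time'

    initial_delay  = 0          # no extra delay before the first job
    delay_interval = 2.minutes  # interval passed to queue_background_migration_jobs_by_range_at_intervals
    batch_counter  = 25         # batches produced by each_batch for the relation

    duration = initial_delay + delay_interval * batch_counter
    duration.to_i # => 3000, i.e. all jobs should have completed ~50 minutes after scheduling
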
diff --git a/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb b/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
index f367292f4b0..0bc1343acca 100644
--- a/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
+++ b/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
@@ -32,7 +32,7 @@ module Gitlab
return
end
- partitioned_table.postgres_partitions.each do |partition|
+ partitioned_table.postgres_partitions.order(:name).each do |partition|
partition_index_name = generated_index_name(partition.identifier, options[:name])
partition_options = options.merge(name: partition_index_name)
diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
index 33faa2ef1b0..62dfaeeaae3 100644
--- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -16,9 +16,9 @@ module Gitlab
# Grouped relations are NOT supported yet.
#
# @example Usage
- # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+ # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
- # .estimate_distinct_count(
+ # .execute(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
@@ -30,7 +30,6 @@ module Gitlab
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
- FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
@@ -38,8 +37,10 @@ module Gitlab
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
+ ZERO_OFFSET = 1
+ BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
BIT_31_MASK = "B'0#{'1' * 31}'"
- BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+ BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
@@ -48,73 +49,58 @@ module Gitlab
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
- SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+ SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
- TOTAL_BUCKETS_NUMBER = 512
+ WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
- def unwanted_configuration?(finish, batch_size, start)
- batch_size <= MIN_REQUIRED_BATCH_SIZE ||
- (finish - start) >= MAX_DATA_VOLUME ||
- start > finish
- end
-
- def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+ # Executes the counter, iterating over the database source, and returns a Gitlab::Database::PostgresHll::Buckets
+ # instance that can be used to estimate the number of unique elements in the analysed set
+ #
+ # @param batch_size maximum number of rows analysed by a single database query
+ # @param start initial value of the primary key range
+ # @param finish final value of the primary key range
+ # @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate the number of unique elements
+ def execute(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
-
start = actual_start(start)
finish = actual_finish(finish)
- raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
- return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+ raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
batch_start = start
- hll_blob = {}
+ hll_buckets = Buckets.new
while batch_start <= finish
begin
- hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+ hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
- estimate_cardinality(hll_blob)
+ hll_buckets
end
private
- # arbitrary values that are present in #estimate_cardinality
- # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
- # article, they are not representing any entity and serves as tune value
- # for the whole equation
- def estimate_cardinality(hll_blob)
- num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-
- num_uniques = (
- ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
- (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
- ).to_i
-
- if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
- ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
- Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
- else
- num_uniques
- end
+ def unwanted_configuration?(start, finish, batch_size)
+ batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+ (finish - start) >= MAX_DATA_VOLUME ||
+ start > finish || start < 0 || finish < 0
end
- def hll_blob_for_batch(start, finish)
+ def hll_buckets_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
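
A quick check of the new mask constants, assuming TOTAL_BUCKETS = 512 as defined in the new Buckets class below: the derived 32-bit mask is bit-for-bit identical to the old hardcoded BIT_9_MASK.

    TOTAL_BUCKETS = 512
    ZERO_OFFSET   = 1

    bucket_id_mask = (TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
    # => "111111111" (nine 1-bits, enough to address bucket ids 0..511)

    bit_32_mask = "B'#{'0' * (32 - bucket_id_mask.size)}#{bucket_id_mask}'"
    # => "B'00000000000000000000000111111111'", the same as the old BIT_9_MASK
    #    ("B'" + 23 zeros + 9 ones + "'")
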
diff --git a/lib/gitlab/database/postgres_hll/buckets.rb b/lib/gitlab/database/postgres_hll/buckets.rb
new file mode 100644
index 00000000000..429e823379f
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll/buckets.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module PostgresHll
+ # The Buckets class represents a data structure built with the HyperLogLog algorithm
+ # that models the data distribution in the analysed set. This representation can then be used
+ # for the following purposes:
+ # 1. Estimating the number of unique elements that this structure represents
+ # 2. Merging with another Buckets structure to later estimate the number of unique elements
+ # in the union of the two represented data sets
+ # 3. Serializing the Buckets structure to JSON, so it can be stored in various persistence layers
+ #
+ # @example Usage
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
+
+ # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation
+ # Like all probabilistic algorithms it has an ERROR RATE margin that can affect the values.
+ # For this implementation no error higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
+ # and in most cases the error is lower. However, if the exact value is necessary other tools have to be used.
+ class Buckets
+ TOTAL_BUCKETS = 512
+
+ def initialize(buckets = {})
+ @buckets = buckets
+ end
+
+ # Estimates the number of unique elements in the analysed set, based on the HyperLogLog structure.
+ #
+ # @return [Float] Estimated number of unique elements
+ def estimated_distinct_count
+ @estimated_distinct_count ||= estimate_cardinality
+ end
+
+ # Updates the instance's underlying HyperLogLog structure by merging it with another HyperLogLog structure
+ #
+ # @param other_buckets_hash hash representation of another HyperLogLog structure
+ def merge_hash!(other_buckets_hash)
+ buckets.merge!(other_buckets_hash) {|_key, old, new| new > old ? new : old }
+ end
+
+ # Serializes the instance's underlying HyperLogLog structure to JSON, so it can be stored in various persistence layers
+ #
+ # @return [String] HyperLogLog data structure serialized to JSON
+ def to_json(_ = nil)
+ buckets.to_json
+ end
+
+ private
+
+ attr_accessor :buckets
+
+ # The arbitrary values present in #estimate_cardinality are sourced from the
+ # https://www.sisense.com/blog/hyperloglog-in-pure-sql/ article;
+ # they do not represent any entity and serve as tuning values
+ # for the whole equation
+ def estimate_cardinality
+ num_zero_buckets = TOTAL_BUCKETS - buckets.size
+
+ num_uniques = (
+ ((TOTAL_BUCKETS**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS))) /
+ (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
+ ).to_i
+
+ if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
+ ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
+ Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
+ else
+ num_uniques
+ end
+ end
+ end
+ end
+ end
+end
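
Taken together with the counter changes above, the intended flow is now roughly as follows (model and column names are illustrative):

    # Estimate the number of distinct creator_id values across projects.
    counter = Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project, :creator_id)

    buckets = counter.execute(batch_size: 10_000)   # => Gitlab::Database::PostgresHll::Buckets

    buckets.estimated_distinct_count # => Float, approximate count of unique creator_id values
    buckets.to_json                  # => serialized buckets, suitable for persisting and merging later
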
diff --git a/lib/gitlab/database/reindexing.rb b/lib/gitlab/database/reindexing.rb
index 832f7438cf9..0cfad690283 100644
--- a/lib/gitlab/database/reindexing.rb
+++ b/lib/gitlab/database/reindexing.rb
@@ -8,9 +8,9 @@ module Gitlab
# candidate_indexes: Array of Gitlab::Database::PostgresIndex
def self.perform(candidate_indexes, how_many: DEFAULT_INDEXES_PER_INVOCATION)
- indexes = IndexSelection.new(candidate_indexes).take(how_many)
-
- Coordinator.new(indexes).perform
+ IndexSelection.new(candidate_indexes).take(how_many).each do |index|
+ Coordinator.new(index).perform
+ end
end
def self.candidate_indexes
diff --git a/lib/gitlab/database/reindexing/coordinator.rb b/lib/gitlab/database/reindexing/coordinator.rb
index 0957f43e166..7a7d17ca196 100644
--- a/lib/gitlab/database/reindexing/coordinator.rb
+++ b/lib/gitlab/database/reindexing/coordinator.rb
@@ -12,26 +12,44 @@ module Gitlab
# statement timeouts).
TIMEOUT_PER_ACTION = 1.day
- attr_reader :indexes
+ attr_reader :index, :notifier
- def initialize(indexes)
- @indexes = indexes
+ def initialize(index, notifier = GrafanaNotifier.new)
+ @index = index
+ @notifier = notifier
end
def perform
- indexes.each do |index|
- # This obtains a global lease such that there's
- # only one live reindexing process at a time.
- try_obtain_lease do
- ReindexAction.keep_track_of(index) do
- ConcurrentReindex.new(index).perform
- end
+ # This obtains a global lease such that there's
+ # only one live reindexing process at a time.
+ try_obtain_lease do
+ action = ReindexAction.create_for(index)
+
+ with_notifications(action) do
+ perform_for(index, action)
end
end
end
private
+ def with_notifications(action)
+ notifier.notify_start(action)
+ yield
+ ensure
+ notifier.notify_end(action)
+ end
+
+ def perform_for(index, action)
+ ConcurrentReindex.new(index).perform
+ rescue
+ action.state = :failed
+
+ raise
+ ensure
+ action.finish
+ end
+
def lease_timeout
TIMEOUT_PER_ACTION
end
diff --git a/lib/gitlab/database/reindexing/grafana_notifier.rb b/lib/gitlab/database/reindexing/grafana_notifier.rb
new file mode 100644
index 00000000000..b1e5ecb9ade
--- /dev/null
+++ b/lib/gitlab/database/reindexing/grafana_notifier.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Reindexing
+ # This can be used to send annotations for reindexing to a Grafana API
+ class GrafanaNotifier
+ def initialize(api_key = ENV['GITLAB_GRAFANA_API_KEY'], api_url = ENV['GITLAB_GRAFANA_API_URL'], additional_tag = ENV['GITLAB_REINDEXING_GRAFANA_TAG'] || Rails.env)
+ @api_key = api_key
+ @api_url = api_url
+ @additional_tag = additional_tag
+ end
+
+ def notify_start(action)
+ return unless enabled?
+
+ payload = base_payload(action).merge(
+ text: "Started reindexing of #{action.index.name} on #{action.index.tablename}"
+ )
+
+ annotate(payload)
+ end
+
+ def notify_end(action)
+ return unless enabled?
+
+ payload = base_payload(action).merge(
+ text: "Finished reindexing of #{action.index.name} on #{action.index.tablename} (#{action.state})",
+ timeEnd: (action.action_end.utc.to_f * 1000).to_i,
+ isRegion: true
+ )
+
+ annotate(payload)
+ end
+
+ private
+
+ def base_payload(action)
+ {
+ time: (action.action_start.utc.to_f * 1000).to_i,
+ tags: ['reindex', @additional_tag, action.index.tablename, action.index.name].compact
+ }
+ end
+
+ def annotate(payload)
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": "Bearer #{@api_key}"
+ }
+
+ response = Gitlab::HTTP.post("#{@api_url}/api/annotations", body: payload.to_json, headers: headers, allow_local_requests: true)
+ success = response.success?
+
+ log_error("Response code #{response.code}") unless success
+
+ success
+ rescue => err
+ log_error(err)
+
+ false
+ end
+
+ def log_error(err)
+ Gitlab::AppLogger.warn("Unable to notify Grafana from #{self.class}: #{err}")
+ end
+
+ def enabled?
+ !(@api_url.blank? || @api_key.blank?)
+ end
+ end
+ end
+ end
+end
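
For reference, the annotation body that notify_end posts to <api_url>/api/annotations looks roughly like this (values are made up; timestamps are epoch milliseconds as computed in base_payload):

    # Illustrative notify_end payload; index/table names, tag and timestamps are invented.
    payload = {
      time:     1_611_167_663_000, # action_start in epoch milliseconds
      tags:     ['reindex', 'production', 'public.merge_requests', 'index_merge_requests_on_title'],
      text:     'Finished reindexing of index_merge_requests_on_title on public.merge_requests (finished)',
      timeEnd:  1_611_168_000_000, # action_end in epoch milliseconds
      isRegion: true
    }
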
diff --git a/lib/gitlab/database/reindexing/reindex_action.rb b/lib/gitlab/database/reindexing/reindex_action.rb
index 8c59cffe5fb..7e58201889f 100644
--- a/lib/gitlab/database/reindexing/reindex_action.rb
+++ b/lib/gitlab/database/reindexing/reindex_action.rb
@@ -14,27 +14,23 @@ module Gitlab
scope :recent, -> { where(state: :finished).where('action_end > ?', Time.zone.now - RECENT_THRESHOLD) }
- def self.keep_track_of(index, &block)
- action = create!(
+ def self.create_for(index)
+ create!(
index_identifier: index.identifier,
action_start: Time.zone.now,
ondisk_size_bytes_start: index.ondisk_size_bytes,
bloat_estimate_bytes_start: index.bloat_size
)
+ end
- yield
-
- action.state = :finished
- rescue
- action.state = :failed
- raise
- ensure
+ def finish
index.reload # rubocop:disable Cop/ActiveRecordAssociationReload
- action.action_end = Time.zone.now
- action.ondisk_size_bytes_end = index.ondisk_size_bytes
+ self.state = :finished unless failed?
+ self.action_end = Time.zone.now
+ self.ondisk_size_bytes_end = index.ondisk_size_bytes
- action.save!
+ save!
end
end
end
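
Put together with the Coordinator changes above, the tracking lifecycle now reads roughly as follows (a sketch of what Coordinator#perform_for does with the new API, given a Gitlab::Database::PostgresIndex record `index`):

    action = Gitlab::Database::Reindexing::ReindexAction.create_for(index) # records start size/bloat

    begin
      Gitlab::Database::Reindexing::ConcurrentReindex.new(index).perform
    rescue
      action.state = :failed # finish keeps the failed state
      raise
    ensure
      action.finish          # sets action_end, final on-disk size, and :finished unless failed
    end
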