diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2021-04-21 02:50:22 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2021-04-21 02:50:22 +0300 |
commit | 9dc93a4519d9d5d7be48ff274127136236a3adb3 (patch) | |
tree | 70467ae3692a0e35e5ea56bcb803eb512a10bedb /lib/gitlab/database | |
parent | 4b0f34b6d759d6299322b3a54453e930c6121ff0 (diff) |
Add latest changes from gitlab-org/gitlab@13-11-stable-eev13.11.0-rc43
Diffstat (limited to 'lib/gitlab/database')
17 files changed, 659 insertions, 130 deletions
diff --git a/lib/gitlab/database/as_with_materialized.rb b/lib/gitlab/database/as_with_materialized.rb new file mode 100644 index 00000000000..7c45f416638 --- /dev/null +++ b/lib/gitlab/database/as_with_materialized.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Gitlab + module Database + # This class is a special Arel node which allows optionally define the `MATERIALIZED` keyword for CTE and Recursive CTE queries. + class AsWithMaterialized < Arel::Nodes::Binary + extend Gitlab::Utils::StrongMemoize + + MATERIALIZED = Arel.sql(' MATERIALIZED') + EMPTY_STRING = Arel.sql('') + attr_reader :expr + + def initialize(left, right, materialized: true) + @expr = if materialized && self.class.materialized_supported? + MATERIALIZED + else + EMPTY_STRING + end + + super(left, right) + end + + # Note: to be deleted after the minimum PG version is set to 12.0 + def self.materialized_supported? + strong_memoize(:materialized_supported) do + Gitlab::Database.version.match?(/^1[2-9]\./) # version 12.x and above + end + end + + # Note: to be deleted after the minimum PG version is set to 12.0 + def self.materialized_if_supported + materialized_supported? ? 'MATERIALIZED' : '' + end + end + end +end diff --git a/lib/gitlab/database/background_migration/batch_metrics.rb b/lib/gitlab/database/background_migration/batch_metrics.rb new file mode 100644 index 00000000000..3e6d7ac3c9f --- /dev/null +++ b/lib/gitlab/database/background_migration/batch_metrics.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module BackgroundMigration + class BatchMetrics + attr_reader :timings + + def initialize + @timings = {} + end + + def time_operation(label) + start_time = monotonic_time + + yield + + timings_for_label(label) << monotonic_time - start_time + end + + private + + def timings_for_label(label) + timings[label] ||= [] + end + + def monotonic_time + Gitlab::Metrics::System.monotonic_time + end + end + end + end +end diff --git a/lib/gitlab/database/background_migration/batched_migration.rb b/lib/gitlab/database/background_migration/batched_migration.rb index 0c9add9b355..4aa33ed7946 100644 --- a/lib/gitlab/database/background_migration/batched_migration.rb +++ b/lib/gitlab/database/background_migration/batched_migration.rb @@ -5,7 +5,7 @@ module Gitlab module BackgroundMigration class BatchedMigration < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord JOB_CLASS_MODULE = 'Gitlab::BackgroundMigration' - BATCH_CLASS_MODULE = "#{JOB_CLASS_MODULE}::BatchingStrategies".freeze + BATCH_CLASS_MODULE = "#{JOB_CLASS_MODULE}::BatchingStrategies" self.table_name = :batched_background_migrations @@ -23,8 +23,15 @@ module Gitlab finished: 3 } - def interval_elapsed? - last_job.nil? || last_job.created_at <= Time.current - interval + def self.active_migration + active.queue_order.first + end + + def interval_elapsed?(variance: 0) + return true unless last_job + + interval_with_variance = interval - variance + last_job.created_at <= Time.current - interval_with_variance end def create_batched_job!(min, max) @@ -50,6 +57,13 @@ module Gitlab def batch_class_name=(class_name) write_attribute(:batch_class_name, class_name.demodulize) end + + def prometheus_labels + @prometheus_labels ||= { + migration_id: id, + migration_identifier: "%s/%s.%s" % [job_class_name, table_name, column_name] + } + end end end end diff --git a/lib/gitlab/database/background_migration/scheduler.rb b/lib/gitlab/database/background_migration/batched_migration_runner.rb index 5f8a5ec06a5..cf8b61f5feb 100644 --- a/lib/gitlab/database/background_migration/scheduler.rb +++ b/lib/gitlab/database/background_migration/batched_migration_runner.rb @@ -3,12 +3,22 @@ module Gitlab module Database module BackgroundMigration - class Scheduler - def perform(migration_wrapper: BatchedMigrationWrapper.new) - active_migration = BatchedMigration.active.queue_order.first - - return unless active_migration&.interval_elapsed? + class BatchedMigrationRunner + def initialize(migration_wrapper = BatchedMigrationWrapper.new) + @migration_wrapper = migration_wrapper + end + # Runs the next batched_job for a batched_background_migration. + # + # The batch bounds of the next job are calculated at runtime, based on the migration + # configuration and the bounds of the most recently created batched_job. Updating the + # migration configuration will cause future jobs to use the updated batch sizes. + # + # The job instance will automatically receive a set of arguments based on the migration + # configuration. For more details, see the BatchedMigrationWrapper class. + # + # Note that this method is primarily intended to called by a scheduled worker. + def run_migration_job(active_migration) if next_batched_job = create_next_batched_job!(active_migration) migration_wrapper.perform(next_batched_job) else @@ -16,8 +26,26 @@ module Gitlab end end + # Runs all remaining batched_jobs for a batched_background_migration. + # + # This method is intended to be used in a test/dev environment to execute the background + # migration inline. It should NOT be used in a real environment for any non-trivial migrations. + def run_entire_migration(migration) + unless Rails.env.development? || Rails.env.test? + raise 'this method is not intended for use in real environments' + end + + while migration.active? + run_migration_job(migration) + + migration.reload_last_job + end + end + private + attr_reader :migration_wrapper + def create_next_batched_job!(active_migration) next_batch_range = find_next_batch_range(active_migration) diff --git a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb index 299bd992197..c276f8ce75b 100644 --- a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb +++ b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb @@ -4,6 +4,15 @@ module Gitlab module Database module BackgroundMigration class BatchedMigrationWrapper + extend Gitlab::Utils::StrongMemoize + + # Wraps the execution of a batched_background_migration. + # + # Updates the job's tracking records with the status of the migration + # when starting and finishing execution, and optionally saves batch_metrics + # the migration provides, if any are given. + # + # The job's batch_metrics are serialized to JSON for storage. def perform(batch_tracking_record) start_tracking_execution(batch_tracking_record) @@ -16,6 +25,7 @@ module Gitlab raise e ensure finish_tracking_execution(batch_tracking_record) + track_prometheus_metrics(batch_tracking_record) end private @@ -34,12 +44,75 @@ module Gitlab tracking_record.migration_column_name, tracking_record.sub_batch_size, *tracking_record.migration_job_arguments) + + if job_instance.respond_to?(:batch_metrics) + tracking_record.metrics = job_instance.batch_metrics + end end def finish_tracking_execution(tracking_record) tracking_record.finished_at = Time.current tracking_record.save! end + + def track_prometheus_metrics(tracking_record) + migration = tracking_record.batched_migration + base_labels = migration.prometheus_labels + + metric_for(:gauge_batch_size).set(base_labels, tracking_record.batch_size) + metric_for(:gauge_sub_batch_size).set(base_labels, tracking_record.sub_batch_size) + metric_for(:counter_updated_tuples).increment(base_labels, tracking_record.batch_size) + + # Time efficiency: Ratio of duration to interval (ideal: less than, but close to 1) + efficiency = (tracking_record.finished_at - tracking_record.started_at).to_i / migration.interval.to_f + metric_for(:histogram_time_efficiency).observe(base_labels, efficiency) + + if metrics = tracking_record.metrics + metrics['timings']&.each do |key, timings| + summary = metric_for(:histogram_timings) + labels = base_labels.merge(operation: key) + + timings.each do |timing| + summary.observe(labels, timing) + end + end + end + end + + def metric_for(name) + self.class.metrics[name] + end + + def self.metrics + strong_memoize(:metrics) do + { + gauge_batch_size: Gitlab::Metrics.gauge( + :batched_migration_job_batch_size, + 'Batch size for a batched migration job' + ), + gauge_sub_batch_size: Gitlab::Metrics.gauge( + :batched_migration_job_sub_batch_size, + 'Sub-batch size for a batched migration job' + ), + counter_updated_tuples: Gitlab::Metrics.counter( + :batched_migration_job_updated_tuples_total, + 'Number of tuples updated by batched migration job' + ), + histogram_timings: Gitlab::Metrics.histogram( + :batched_migration_job_duration_seconds, + 'Timings for a batched migration job', + {}, + [0.1, 0.25, 0.5, 1, 5].freeze + ), + histogram_time_efficiency: Gitlab::Metrics.histogram( + :batched_migration_job_time_efficiency, + 'Ratio of job duration to interval', + {}, + [0.5, 0.9, 1, 1.5, 2].freeze + ) + } + end + end end end end diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb index 5a506da0d05..9002d39e1ee 100644 --- a/lib/gitlab/database/batch_count.rb +++ b/lib/gitlab/database/batch_count.rb @@ -88,11 +88,16 @@ module Gitlab batch_start = start while batch_start < finish - batch_end = [batch_start + batch_size, finish].min - batch_relation = build_relation_batch(batch_start, batch_end, mode) - begin - results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend + batch_end = [batch_start + batch_size, finish].min + batch_relation = build_relation_batch(batch_start, batch_end, mode) + + op_args = @operation_args + if @operation == :count && @operation_args.blank? && use_loose_index_scan_for_distinct_values?(mode) + op_args = [Gitlab::Database::LooseIndexScanDistinctCount::COLUMN_ALIAS] + end + + results = merge_results(results, batch_relation.send(@operation, *op_args)) # rubocop:disable GitlabSecurity/PublicSend batch_start = batch_end rescue ActiveRecord::QueryCanceled => error # retry with a safe batch size & warmer cache @@ -102,6 +107,18 @@ module Gitlab log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error) return FALLBACK end + rescue Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError => error + Gitlab::AppJsonLogger + .error( + event: 'batch_count', + relation: @relation.table_name, + operation: @operation, + operation_args: @operation_args, + mode: mode, + message: "LooseIndexScanDistinctCount column error: #{error.message}" + ) + + return FALLBACK end sleep(SLEEP_TIME_IN_SECONDS) @@ -123,7 +140,11 @@ module Gitlab private def build_relation_batch(start, finish, mode) - @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend + if use_loose_index_scan_for_distinct_values?(mode) + Gitlab::Database::LooseIndexScanDistinctCount.new(@relation, @column).build_query(from: start, to: finish) + else + @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend + end end def batch_size_for_mode_and_operation(mode, operation) @@ -165,6 +186,14 @@ module Gitlab message: "Query has been canceled with message: #{error.message}" ) end + + def use_loose_index_scan_for_distinct_values?(mode) + Feature.enabled?(:loose_index_scan_for_distinct_values) && not_group_by_query? && mode == :distinct + end + + def not_group_by_query? + !@relation.is_a?(ActiveRecord::Relation) || @relation.group_values.blank? + end end end end diff --git a/lib/gitlab/database/bulk_update.rb b/lib/gitlab/database/bulk_update.rb index 1403d561890..b1f9da30585 100644 --- a/lib/gitlab/database/bulk_update.rb +++ b/lib/gitlab/database/bulk_update.rb @@ -130,7 +130,7 @@ module Gitlab def sql <<~SQL - WITH cte(#{list_of(cte_columns)}) AS (VALUES #{list_of(values)}) + WITH cte(#{list_of(cte_columns)}) AS #{Gitlab::Database::AsWithMaterialized.materialized_if_supported} (VALUES #{list_of(values)}) UPDATE #{table_name} SET #{list_of(updates)} FROM cte WHERE cte_id = id SQL end diff --git a/lib/gitlab/database/count/reltuples_count_strategy.rb b/lib/gitlab/database/count/reltuples_count_strategy.rb index 89190320cf9..a7bfafe2815 100644 --- a/lib/gitlab/database/count/reltuples_count_strategy.rb +++ b/lib/gitlab/database/count/reltuples_count_strategy.rb @@ -3,10 +3,6 @@ module Gitlab module Database module Count - class PgClass < ActiveRecord::Base - self.table_name = 'pg_class' - end - # This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables. # # Specifically, it relies on the column reltuples in said table. An additional @@ -74,7 +70,7 @@ module Gitlab def get_statistics(table_names, check_statistics: true) time = 6.hours.ago - query = PgClass.joins("LEFT JOIN pg_stat_user_tables ON pg_stat_user_tables.relid = pg_class.oid") + query = ::Gitlab::Database::PgClass.joins("LEFT JOIN pg_stat_user_tables ON pg_stat_user_tables.relid = pg_class.oid") .where(relname: table_names) .where('schemaname = current_schema()') .select('pg_class.relname AS table_name, reltuples::bigint AS estimate') diff --git a/lib/gitlab/database/loose_index_scan_distinct_count.rb b/lib/gitlab/database/loose_index_scan_distinct_count.rb new file mode 100644 index 00000000000..884f4d47ff8 --- /dev/null +++ b/lib/gitlab/database/loose_index_scan_distinct_count.rb @@ -0,0 +1,102 @@ +# frozen_string_literal: true + +module Gitlab + module Database + # This class builds efficient batched distinct query by using loose index scan. + # Consider the following example: + # > Issue.distinct(:project_id).where(project_id: (1...100)).count + # + # Note: there is an index on project_id + # + # This query will read each element in the index matching the project_id filter. + # If for a project_id has 100_000 issues, all 100_000 elements will be read. + # + # A loose index scan will read only one entry from the index for each project_id to reduce the number of disk reads. + # + # Usage: + # + # Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).count(from: 1, to: 100) + # + # The query will return the number of distinct projects_ids between 1 and 100 + # + # Getting the Arel query: + # + # Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).build_query(from: 1, to: 100) + class LooseIndexScanDistinctCount + COLUMN_ALIAS = 'distinct_count_column' + + ColumnConfigurationError = Class.new(StandardError) + + def initialize(scope, column) + if scope.is_a?(ActiveRecord::Relation) + @scope = scope + @model = scope.model + else + @scope = scope.where({}) + @model = scope + end + + @column = transform_column(column) + end + + def count(from:, to:) + build_query(from: from, to: to).count(COLUMN_ALIAS) + end + + def build_query(from:, to:) # rubocop:disable Metrics/AbcSize + cte = Gitlab::SQL::RecursiveCTE.new(:counter_cte, union_args: { remove_order: false }) + table = model.arel_table + + cte << @scope + .dup + .select(column.as(COLUMN_ALIAS)) + .where(column.gteq(from)) + .where(column.lt(to)) + .order(column) + .limit(1) + + inner_query = @scope + .dup + .where(column.gt(cte.table[COLUMN_ALIAS])) + .where(column.lt(to)) + .select(column.as(COLUMN_ALIAS)) + .order(column) + .limit(1) + + cte << cte.table + .project(Arel::Nodes::Grouping.new(Arel.sql(inner_query.to_sql)).as(COLUMN_ALIAS)) + .where(cte.table[COLUMN_ALIAS].lt(to)) + + model + .with + .recursive(cte.to_arel) + .from(cte.alias_to(table)) + .unscope(where: :source_type) + .unscope(where: model.inheritance_column) # Remove STI query, not needed here + end + + private + + attr_reader :column, :model + + # Transforms the column so it can be used in Arel expressions + # + # 'table.column' => 'table.column' + # 'column' => 'table_name.column' + # :column => 'table_name.column' + # Arel::Attributes::Attribute => name of the column + def transform_column(column) + if column.is_a?(String) || column.is_a?(Symbol) + column_as_string = column.to_s + column_as_string = "#{model.table_name}.#{column_as_string}" unless column_as_string.include?('.') + + Arel.sql(column_as_string) + elsif column.is_a?(Arel::Attributes::Attribute) + column + else + raise ColumnConfigurationError.new("Cannot transform the column: #{column.inspect}, please provide the column name as string") + end + end + end + end +end diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb index 31e733050e1..d06a73da8ac 100644 --- a/lib/gitlab/database/migration_helpers.rb +++ b/lib/gitlab/database/migration_helpers.rb @@ -4,6 +4,7 @@ module Gitlab module Database module MigrationHelpers include Migrations::BackgroundMigrationHelpers + include DynamicModelHelpers # https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS MAX_IDENTIFIER_NAME_LENGTH = 63 @@ -576,17 +577,7 @@ module Gitlab # old_column - The name of the old column. # new_column - The name of the new column. def install_rename_triggers(table, old_column, new_column) - trigger_name = rename_trigger_name(table, old_column, new_column) - quoted_table = quote_table_name(table) - quoted_old = quote_column_name(old_column) - quoted_new = quote_column_name(new_column) - - install_rename_triggers_for_postgresql( - trigger_name, - quoted_table, - quoted_old, - quoted_new - ) + install_rename_triggers_for_postgresql(table, old_column, new_column) end # Changes the type of a column concurrently. @@ -927,19 +918,67 @@ module Gitlab # This is crucial for Primary Key conversions, because setting a column # as the PK converts even check constraints to NOT NULL constraints # and forces an inline re-verification of the whole table. - # - It backfills the new column with the values of the existing primary key - # by scheduling background jobs. - # - It tracks the scheduled background jobs through the use of - # Gitlab::Database::BackgroundMigrationJob + # - It sets up a trigger to keep the two columns in sync. + # + # Note: this helper is intended to be used in a regular (pre-deployment) migration. + # + # This helper is part 1 of a multi-step migration process: + # 1. initialize_conversion_of_integer_to_bigint to create the new column and database triggers + # 2. backfill_conversion_of_integer_to_bigint to copy historic data using background migrations + # 3. remaining steps TBD, see #288005 + # + # table - The name of the database table containing the column + # column - The name of the column that we want to convert to bigint. + # primary_key - The name of the primary key column (most often :id) + def initialize_conversion_of_integer_to_bigint(table, column, primary_key: :id) + unless table_exists?(table) + raise "Table #{table} does not exist" + end + + unless column_exists?(table, primary_key) + raise "Column #{primary_key} does not exist on #{table}" + end + + unless column_exists?(table, column) + raise "Column #{column} does not exist on #{table}" + end + + check_trigger_permissions!(table) + + old_column = column_for(table, column) + tmp_column = "#{column}_convert_to_bigint" + + with_lock_retries do + if (column.to_s == primary_key.to_s) || !old_column.null + # If the column to be converted is either a PK or is defined as NOT NULL, + # set it to `NOT NULL DEFAULT 0` and we'll copy paste the correct values bellow + # That way, we skip the expensive validation step required to add + # a NOT NULL constraint at the end of the process + add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false) + else + add_column(table, tmp_column, :bigint, default: old_column.default) + end + + install_rename_triggers(table, column, tmp_column) + end + end + + # Backfills the new column used in the conversion of an integer column to bigint using background migrations. + # + # - This helper should be called from a post-deployment migration. + # - In order for this helper to work properly, the new column must be first initialized with + # the `initialize_conversion_of_integer_to_bigint` helper. + # - It tracks the scheduled background jobs through Gitlab::Database::BackgroundMigration::BatchedMigration, # which allows a more thorough check that all jobs succeeded in the # cleanup migration and is way faster for very large tables. - # - It sets up a trigger to keep the two columns in sync - # - It does not schedule a cleanup job: we have to do that with followup - # post deployment migrations in the next release. # - # This needs to be done manually by using the - # `cleanup_initialize_conversion_of_integer_to_bigint` - # (not yet implemented - check #288005) + # Note: this helper is intended to be used in a post-deployment migration, to ensure any new code is + # deployed (including background job changes) before we begin processing the background migration. + # + # This helper is part 2 of a multi-step migration process: + # 1. initialize_conversion_of_integer_to_bigint to create the new column and database triggers + # 2. backfill_conversion_of_integer_to_bigint to copy historic data using background migrations + # 3. remaining steps TBD, see #288005 # # table - The name of the database table containing the column # column - The name of the column that we want to convert to bigint. @@ -960,7 +999,7 @@ module Gitlab # and set the batch_size to 50_000 which will require # ~50s = (50000 / 200) * (0.1 + 0.1) to complete and leaves breathing space # between the scheduled jobs - def initialize_conversion_of_integer_to_bigint( + def backfill_conversion_of_integer_to_bigint( table, column, primary_key: :id, @@ -969,10 +1008,6 @@ module Gitlab interval: 2.minutes ) - if transaction_open? - raise 'initialize_conversion_of_integer_to_bigint can not be run inside a transaction' - end - unless table_exists?(table) raise "Table #{table} does not exist" end @@ -985,87 +1020,42 @@ module Gitlab raise "Column #{column} does not exist on #{table}" end - check_trigger_permissions!(table) - - old_column = column_for(table, column) tmp_column = "#{column}_convert_to_bigint" - with_lock_retries do - if (column.to_s == primary_key.to_s) || !old_column.null - # If the column to be converted is either a PK or is defined as NOT NULL, - # set it to `NOT NULL DEFAULT 0` and we'll copy paste the correct values bellow - # That way, we skip the expensive validation step required to add - # a NOT NULL constraint at the end of the process - add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false) - else - add_column(table, tmp_column, :bigint, default: old_column.default) - end - - install_rename_triggers(table, column, tmp_column) - end - - source_model = Class.new(ActiveRecord::Base) do - include EachBatch - - self.table_name = table - self.inheritance_column = :_type_disabled + unless column_exists?(table, tmp_column) + raise 'The temporary column does not exist, initialize it with `initialize_conversion_of_integer_to_bigint`' end - queue_background_migration_jobs_by_range_at_intervals( - source_model, + batched_migration = queue_batched_background_migration( 'CopyColumnUsingBackgroundMigrationJob', - interval, + table, + primary_key, + column, + tmp_column, + job_interval: interval, batch_size: batch_size, - other_job_arguments: [table, primary_key, sub_batch_size, column, tmp_column], - track_jobs: true, - primary_column_name: primary_key - ) + sub_batch_size: sub_batch_size) if perform_background_migration_inline? # To ensure the schema is up to date immediately we perform the # migration inline in dev / test environments. - Gitlab::BackgroundMigration.steal('CopyColumnUsingBackgroundMigrationJob') + Gitlab::Database::BackgroundMigration::BatchedMigrationRunner.new.run_entire_migration(batched_migration) end end # Performs a concurrent column rename when using PostgreSQL. - def install_rename_triggers_for_postgresql(trigger, table, old, new) - execute <<-EOF.strip_heredoc - CREATE OR REPLACE FUNCTION #{trigger}() - RETURNS trigger AS - $BODY$ - BEGIN - NEW.#{new} := NEW.#{old}; - RETURN NEW; - END; - $BODY$ - LANGUAGE 'plpgsql' - VOLATILE - EOF - - execute <<-EOF.strip_heredoc - DROP TRIGGER IF EXISTS #{trigger} - ON #{table} - EOF - - execute <<-EOF.strip_heredoc - CREATE TRIGGER #{trigger} - BEFORE INSERT OR UPDATE - ON #{table} - FOR EACH ROW - EXECUTE FUNCTION #{trigger}() - EOF + def install_rename_triggers_for_postgresql(table, old, new, trigger_name: nil) + Gitlab::Database::UnidirectionalCopyTrigger.on_table(table).create(old, new, trigger_name: trigger_name) end # Removes the triggers used for renaming a PostgreSQL column concurrently. def remove_rename_triggers_for_postgresql(table, trigger) - execute("DROP TRIGGER IF EXISTS #{trigger} ON #{table}") - execute("DROP FUNCTION IF EXISTS #{trigger}()") + Gitlab::Database::UnidirectionalCopyTrigger.on_table(table).drop(trigger) end # Returns the (base) name to use for triggers when renaming columns. def rename_trigger_name(table, old, new) - 'trigger_' + Digest::SHA256.hexdigest("#{table}_#{old}_#{new}").first(12) + Gitlab::Database::UnidirectionalCopyTrigger.on_table(table).name(old, new) end # Returns an Array containing the indexes for the given column diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb index e8cbea72887..8d5ea652bfc 100644 --- a/lib/gitlab/database/migrations/background_migration_helpers.rb +++ b/lib/gitlab/database/migrations/background_migration_helpers.rb @@ -190,7 +190,7 @@ module Gitlab migration_status = batch_max_value.nil? ? :finished : :active batch_max_value ||= batch_min_value - Gitlab::Database::BackgroundMigration::BatchedMigration.create!( + migration = Gitlab::Database::BackgroundMigration::BatchedMigration.create!( job_class_name: job_class_name, table_name: batch_table_name, column_name: batch_column_name, @@ -202,6 +202,17 @@ module Gitlab sub_batch_size: sub_batch_size, job_arguments: job_arguments, status: migration_status) + + # This guard is necessary since #total_tuple_count was only introduced schema-wise, + # after this migration helper had been used for the first time. + return migration unless migration.respond_to?(:total_tuple_count) + + # We keep track of the estimated number of tuples to reason later + # about the overall progress of a migration. + migration.total_tuple_count = Gitlab::Database::PgClass.for_table(batch_table_name)&.cardinality_estimate + migration.save! + + migration end def perform_background_migration_inline? @@ -236,6 +247,14 @@ module Gitlab Gitlab::ApplicationContext.with_context(caller_id: self.class.to_s, &block) end + def delete_queued_jobs(class_name) + Gitlab::BackgroundMigration.steal(class_name) do |job| + job.delete + + false + end + end + private def track_in_database(class_name, arguments) diff --git a/lib/gitlab/database/partitioning_migration_helpers/foreign_key_helpers.rb b/lib/gitlab/database/partitioning_migration_helpers/foreign_key_helpers.rb index 2def3a4d3a9..4402c42b136 100644 --- a/lib/gitlab/database/partitioning_migration_helpers/foreign_key_helpers.rb +++ b/lib/gitlab/database/partitioning_migration_helpers/foreign_key_helpers.rb @@ -6,6 +6,80 @@ module Gitlab module ForeignKeyHelpers include ::Gitlab::Database::SchemaHelpers + # Adds a foreign key with only minimal locking on the tables involved. + # + # In concept it works similarly to add_concurrent_foreign_key, but we have + # to add a special helper for partitioned tables for the following reasons: + # - add_concurrent_foreign_key sets the constraint to `NOT VALID` + # before validating it + # - Setting an FK to NOT VALID is not supported currently in Postgres (up to PG13) + # - Also, PostgreSQL will currently ignore NOT VALID constraints on partitions + # when adding a valid FK to the partitioned table, so they have to + # also be validated before we can add the final FK. + # Solution: + # - Add the foreign key first to each partition by using + # add_concurrent_foreign_key and validating it + # - Once all partitions have a foreign key, add it also to the partitioned + # table (there will be no need for a validation at that level) + # For those reasons, this method does not include an option to delay the + # validation, we have to force validate: true. + # + # source - The source (partitioned) table containing the foreign key. + # target - The target table the key points to. + # column - The name of the column to create the foreign key on. + # on_delete - The action to perform when associated data is removed, + # defaults to "CASCADE". + # name - The name of the foreign key. + # + def add_concurrent_partitioned_foreign_key(source, target, column:, on_delete: :cascade, name: nil) + partition_options = { + column: column, + on_delete: on_delete, + + # We'll use the same FK name for all partitions and match it to + # the name used for the partitioned table to follow the convention + # used by PostgreSQL when adding FKs to new partitions + name: name.presence || concurrent_partitioned_foreign_key_name(source, column), + + # Force the FK validation to true for partitions (and the partitioned table) + validate: true + } + + if foreign_key_exists?(source, target, **partition_options) + warning_message = "Foreign key not created because it exists already " \ + "(this may be due to an aborted migration or similar): " \ + "source: #{source}, target: #{target}, column: #{partition_options[:column]}, "\ + "name: #{partition_options[:name]}, on_delete: #{partition_options[:on_delete]}" + + Gitlab::AppLogger.warn warning_message + + return + end + + partitioned_table = find_partitioned_table(source) + + partitioned_table.postgres_partitions.order(:name).each do |partition| + add_concurrent_foreign_key(partition.identifier, target, **partition_options) + end + + with_lock_retries do + add_foreign_key(source, target, **partition_options) + end + end + + # Returns the name for a concurrent partitioned foreign key. + # + # Similar to concurrent_foreign_key_name (Gitlab::Database::MigrationHelpers) + # we just keep a separate method in case we want a different behavior + # for partitioned tables + # + def concurrent_partitioned_foreign_key_name(table, column, prefix: 'fk_rails_') + identifier = "#{table}_#{column}_fk" + hashed_identifier = Digest::SHA256.hexdigest(identifier).first(10) + + "#{prefix}#{hashed_identifier}" + end + # Creates a "foreign key" that references a partitioned table. Because foreign keys referencing partitioned # tables are not supported in PG11, this does not create a true database foreign key, but instead implements the # same functionality at the database level by using triggers. diff --git a/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb b/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb index 1c289391e21..9ccbdc9930e 100644 --- a/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb +++ b/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb @@ -223,6 +223,28 @@ module Gitlab replace_table(table_name, archived_table_name, partitioned_table_name, primary_key_name) end + def drop_nonpartitioned_archive_table(table_name) + assert_table_is_allowed(table_name) + + archived_table_name = make_archived_table_name(table_name) + + with_lock_retries do + drop_sync_trigger(table_name) + end + + drop_table(archived_table_name) + end + + def create_trigger_to_sync_tables(source_table_name, partitioned_table_name, unique_key) + function_name = make_sync_function_name(source_table_name) + trigger_name = make_sync_trigger_name(source_table_name) + + create_sync_function(function_name, partitioned_table_name, unique_key) + create_comment('FUNCTION', function_name, "Partitioning migration: table sync for #{source_table_name} table") + + create_sync_trigger(source_table_name, trigger_name, function_name) + end + private def assert_table_is_allowed(table_name) @@ -316,16 +338,6 @@ module Gitlab create_range_partition(partition_name, table_name, lower_bound, upper_bound) end - def create_trigger_to_sync_tables(source_table_name, partitioned_table_name, unique_key) - function_name = make_sync_function_name(source_table_name) - trigger_name = make_sync_trigger_name(source_table_name) - - create_sync_function(function_name, partitioned_table_name, unique_key) - create_comment('FUNCTION', function_name, "Partitioning migration: table sync for #{source_table_name} table") - - create_sync_trigger(source_table_name, trigger_name, function_name) - end - def drop_sync_trigger(source_table_name) trigger_name = make_sync_trigger_name(source_table_name) drop_trigger(source_table_name, trigger_name) diff --git a/lib/gitlab/database/pg_class.rb b/lib/gitlab/database/pg_class.rb new file mode 100644 index 00000000000..0ce9eebc14c --- /dev/null +++ b/lib/gitlab/database/pg_class.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +module Gitlab + module Database + class PgClass < ActiveRecord::Base + self.table_name = 'pg_class' + + def self.for_table(relname) + joins("LEFT JOIN pg_stat_user_tables ON pg_stat_user_tables.relid = pg_class.oid") + .where('schemaname = current_schema()') + .find_by(relname: relname) + end + + def cardinality_estimate + tuples = reltuples.to_i + + return if tuples < 1 + + tuples + end + end + end +end diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb index 62dfaeeaae3..e8b49c7f62c 100644 --- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb +++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb @@ -41,19 +41,6 @@ module Gitlab BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2) BIT_31_MASK = "B'0#{'1' * 31}'" BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'" - # @example source_query - # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits - # FROM %{relation} - # WHERE %{pkey} >= %{batch_start} - # AND %{pkey} < %{batch_end} - # AND %{column} IS NOT NULL - BUCKETED_DATA_SQL = <<~SQL - WITH hashed_attributes AS (%{source_query}) - SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num, - (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash - FROM hashed_attributes - GROUP BY 1 - SQL WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid) @@ -103,7 +90,7 @@ module Gitlab def hll_buckets_for_batch(start, finish) @relation .connection - .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) }) + .execute(bucketed_data_sql % { source_query: source_query(start, finish) }) .map(&:values) .to_h end @@ -139,6 +126,22 @@ module Gitlab def actual_finish(finish) finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0 end + + # @example source_query + # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits + # FROM %{relation} + # WHERE %{pkey} >= %{batch_start} + # AND %{pkey} < %{batch_end} + # AND %{column} IS NOT NULL + def bucketed_data_sql + <<~SQL + WITH hashed_attributes AS #{Gitlab::Database::AsWithMaterialized.materialized_if_supported} (%{source_query}) + SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num, + (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash + FROM hashed_attributes + GROUP BY 1 + SQL + end end end end diff --git a/lib/gitlab/database/similarity_score.rb b/lib/gitlab/database/similarity_score.rb index 40845c0d5e0..20bf6fa4d30 100644 --- a/lib/gitlab/database/similarity_score.rb +++ b/lib/gitlab/database/similarity_score.rb @@ -10,7 +10,7 @@ module Gitlab # Adds a "magic" comment in the generated SQL expression in order to be able to tell if we're sorting by similarity. # Example: /* gitlab/database/similarity_score */ SIMILARITY(COALESCE... - SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION = "/* #{DISPLAY_NAME} */ SIMILARITY".freeze + SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION = "/* #{DISPLAY_NAME} */ SIMILARITY" # This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity. # diff --git a/lib/gitlab/database/unidirectional_copy_trigger.rb b/lib/gitlab/database/unidirectional_copy_trigger.rb new file mode 100644 index 00000000000..029c894a5ff --- /dev/null +++ b/lib/gitlab/database/unidirectional_copy_trigger.rb @@ -0,0 +1,97 @@ +# frozen_string_literal: true + +module Gitlab + module Database + class UnidirectionalCopyTrigger + def self.on_table(table_name, connection: ActiveRecord::Base.connection) + new(table_name, connection) + end + + def name(from_column_names, to_column_names) + from_column_names, to_column_names = check_column_names!(from_column_names, to_column_names) + + unchecked_name(from_column_names, to_column_names) + end + + def create(from_column_names, to_column_names, trigger_name: nil) + from_column_names, to_column_names = check_column_names!(from_column_names, to_column_names) + trigger_name ||= unchecked_name(from_column_names, to_column_names) + + assignment_clauses = assignment_clauses_for_columns(from_column_names, to_column_names) + + connection.execute(<<~SQL) + CREATE OR REPLACE FUNCTION #{trigger_name}() + RETURNS trigger AS + $BODY$ + BEGIN + #{assignment_clauses}; + RETURN NEW; + END; + $BODY$ + LANGUAGE 'plpgsql' + VOLATILE + SQL + + connection.execute(<<~SQL) + DROP TRIGGER IF EXISTS #{trigger_name} + ON #{quoted_table_name} + SQL + + connection.execute(<<~SQL) + CREATE TRIGGER #{trigger_name} + BEFORE INSERT OR UPDATE + ON #{quoted_table_name} + FOR EACH ROW + EXECUTE FUNCTION #{trigger_name}() + SQL + end + + def drop(trigger_name) + connection.execute("DROP TRIGGER IF EXISTS #{trigger_name} ON #{quoted_table_name}") + connection.execute("DROP FUNCTION IF EXISTS #{trigger_name}()") + end + + private + + attr_reader :table_name, :connection + + def initialize(table_name, connection) + @table_name = table_name + @connection = connection + end + + def quoted_table_name + @quoted_table_name ||= connection.quote_table_name(table_name) + end + + def check_column_names!(from_column_names, to_column_names) + from_column_names = Array.wrap(from_column_names) + to_column_names = Array.wrap(to_column_names) + + unless from_column_names.size == to_column_names.size + raise ArgumentError, 'number of source and destination columns must match' + end + + [from_column_names, to_column_names] + end + + def unchecked_name(from_column_names, to_column_names) + joined_column_names = from_column_names.zip(to_column_names).flatten.join('_') + 'trigger_' + Digest::SHA256.hexdigest("#{table_name}_#{joined_column_names}").first(12) + end + + def assignment_clauses_for_columns(from_column_names, to_column_names) + combined_column_names = to_column_names.zip(from_column_names) + + assignment_clauses = combined_column_names.map do |(new_name, old_name)| + new_name = connection.quote_column_name(new_name) + old_name = connection.quote_column_name(old_name) + + "NEW.#{new_name} := NEW.#{old_name}" + end + + assignment_clauses.join(";\n ") + end + end + end +end |