diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2020-08-20 21:42:06 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2020-08-20 21:42:06 +0300 |
commit | 6e4e1050d9dba2b7b2523fdd1768823ab85feef4 (patch) | |
tree | 78be5963ec075d80116a932011d695dd33910b4e /lib/gitlab/database | |
parent | 1ce776de4ae122aba3f349c02c17cebeaa8ecf07 (diff) |
Add latest changes from gitlab-org/gitlab@13-3-stable-ee
Diffstat (limited to 'lib/gitlab/database')
14 files changed, 296 insertions, 44 deletions
diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb index ab069ce1da1..1762b81b7d8 100644 --- a/lib/gitlab/database/batch_count.rb +++ b/lib/gitlab/database/batch_count.rb @@ -16,6 +16,7 @@ # batch_count(::Clusters::Cluster.aws_installed.enabled, :cluster_id) # batch_distinct_count(::Project, :creator_id) # batch_distinct_count(::Project.with_active_services.service_desk_enabled.where(time_period), start: ::User.minimum(:id), finish: ::User.maximum(:id)) +# batch_sum(User, :sign_in_count) module Gitlab module Database module BatchCount @@ -27,6 +28,10 @@ module Gitlab BatchCounter.new(relation, column: column).count(mode: :distinct, batch_size: batch_size, start: start, finish: finish) end + def batch_sum(relation, column, batch_size: nil, start: nil, finish: nil) + BatchCounter.new(relation, column: nil, operation: :sum, operation_args: [column]).count(batch_size: batch_size, start: start, finish: finish) + end + class << self include BatchCount end @@ -35,6 +40,7 @@ module Gitlab class BatchCounter FALLBACK = -1 MIN_REQUIRED_BATCH_SIZE = 1_250 + DEFAULT_SUM_BATCH_SIZE = 1_000 MAX_ALLOWED_LOOPS = 10_000 SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep ALLOWED_MODES = [:itself, :distinct].freeze @@ -43,13 +49,16 @@ module Gitlab DEFAULT_DISTINCT_BATCH_SIZE = 10_000 DEFAULT_BATCH_SIZE = 100_000 - def initialize(relation, column: nil) + def initialize(relation, column: nil, operation: :count, operation_args: nil) @relation = relation @column = column || relation.primary_key + @operation = operation + @operation_args = operation_args end def unwanted_configuration?(finish, batch_size, start) - batch_size <= MIN_REQUIRED_BATCH_SIZE || + (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) || + (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) || (finish - start) / batch_size >= MAX_ALLOWED_LOOPS || start > finish end @@ -60,7 +69,7 @@ module Gitlab check_mode!(mode) # non-distinct have better performance - batch_size ||= mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE + batch_size ||= batch_size_for_mode_and_operation(mode, @operation) start = actual_start(start) finish = actual_finish(finish) @@ -91,11 +100,17 @@ module Gitlab def batch_fetch(start, finish, mode) # rubocop:disable GitlabSecurity/PublicSend - @relation.select(@column).public_send(mode).where(between_condition(start, finish)).count + @relation.select(@column).public_send(mode).where(between_condition(start, finish)).send(@operation, *@operation_args) end private + def batch_size_for_mode_and_operation(mode, operation) + return DEFAULT_SUM_BATCH_SIZE if operation == :sum + + mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE + end + def between_condition(start, finish) return @column.between(start..(finish - 1)) if @column.is_a?(Arel::Attributes::Attribute) diff --git a/lib/gitlab/database/connection_timer.rb b/lib/gitlab/database/connection_timer.rb index ef8d52ba71c..f9b893ffd0f 100644 --- a/lib/gitlab/database/connection_timer.rb +++ b/lib/gitlab/database/connection_timer.rb @@ -23,7 +23,7 @@ module Gitlab end def interval_with_randomization - interval + rand(RANDOMIZATION_INTERVAL) if interval.positive? + interval + rand(RANDOMIZATION_INTERVAL) if interval > 0 end def current_clock_value diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb index 006a24da8fe..a618a3017b2 100644 --- a/lib/gitlab/database/migration_helpers.rb +++ b/lib/gitlab/database/migration_helpers.rb @@ -815,7 +815,7 @@ module Gitlab BEFORE INSERT OR UPDATE ON #{table} FOR EACH ROW - EXECUTE PROCEDURE #{trigger}() + EXECUTE FUNCTION #{trigger}() EOF end @@ -1062,7 +1062,7 @@ into similar problems in the future (e.g. when new tables are created). AND pg_class.relname = '#{table}' SQL - connection.select_value(check_sql).positive? + connection.select_value(check_sql) > 0 end # Adds a check constraint to a table diff --git a/lib/gitlab/database/partitioning/partition_creator.rb b/lib/gitlab/database/partitioning/partition_creator.rb index 348dd1ba660..4c1b13fe3b5 100644 --- a/lib/gitlab/database/partitioning/partition_creator.rb +++ b/lib/gitlab/database/partitioning/partition_creator.rb @@ -24,7 +24,7 @@ module Gitlab end def create_partitions - return unless Feature.enabled?(:postgres_dynamic_partition_creation, default_enabled: true) + Gitlab::AppLogger.info("Checking state of dynamic postgres partitions") models.each do |model| # Double-checking before getting the lease: diff --git a/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb b/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb index b676767f41d..e6d8ec55319 100644 --- a/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb +++ b/lib/gitlab/database/partitioning_migration_helpers/table_management_helpers.rb @@ -16,7 +16,9 @@ module Gitlab BATCH_SIZE = 50_000 # Creates a partitioned copy of an existing table, using a RANGE partitioning strategy on a timestamp column. - # One partition is created per month between the given `min_date` and `max_date`. + # One partition is created per month between the given `min_date` and `max_date`. Also installs a trigger on + # the original table to copy writes into the partitioned table. To copy over historic data from before creation + # of the partitioned table, use the `enqueue_partitioning_data_migration` helper in a post-deploy migration. # # A copy of the original table is required as PG currently does not support partitioning existing tables. # @@ -56,10 +58,10 @@ module Gitlab create_range_partitioned_copy(table_name, partitioned_table_name, partition_column, primary_key) create_daterange_partitions(partitioned_table_name, partition_column.name, min_date, max_date) create_trigger_to_sync_tables(table_name, partitioned_table_name, primary_key) - enqueue_background_migration(table_name, partitioned_table_name, primary_key) end - # Clean up a partitioned copy of an existing table. This deletes the partitioned table and all partitions. + # Clean up a partitioned copy of an existing table. First, deletes the database function and trigger that were + # used to copy writes to the partitioned table, then removes the partitioned table (also removing partitions). # # Example: # @@ -69,8 +71,6 @@ module Gitlab assert_table_is_allowed(table_name) assert_not_in_transaction_block(scope: ERROR_SCOPE) - cleanup_migration_jobs(table_name) - with_lock_retries do trigger_name = make_sync_trigger_name(table_name) drop_trigger(table_name, trigger_name) @@ -83,6 +83,38 @@ module Gitlab drop_table(partitioned_table_name) end + # Enqueue the background jobs that will backfill data in the partitioned table, by batch-copying records from + # original table. This helper should be called from a post-deploy migration. + # + # Example: + # + # enqueue_partitioning_data_migration :audit_events + # + def enqueue_partitioning_data_migration(table_name) + assert_table_is_allowed(table_name) + + assert_not_in_transaction_block(scope: ERROR_SCOPE) + + partitioned_table_name = make_partitioned_table_name(table_name) + primary_key = connection.primary_key(table_name) + enqueue_background_migration(table_name, partitioned_table_name, primary_key) + end + + # Cleanup a previously enqueued background migration to copy data into a partitioned table. This will not + # prevent the enqueued jobs from executing, but instead cleans up information in the database used to track the + # state of the background migration. It should be safe to also remove the partitioned table even if the + # background jobs are still in-progress, as the absence of the table will cause them to safely exit. + # + # Example: + # + # cleanup_partitioning_data_migration :audit_events + # + def cleanup_partitioning_data_migration(table_name) + assert_table_is_allowed(table_name) + + cleanup_migration_jobs(table_name) + end + def create_hash_partitions(table_name, number_of_partitions) transaction do (0..number_of_partitions - 1).each do |partition| diff --git a/lib/gitlab/database/postgresql_adapter/dump_schema_versions_mixin.rb b/lib/gitlab/database/postgresql_adapter/dump_schema_versions_mixin.rb new file mode 100644 index 00000000000..59bd24d3c37 --- /dev/null +++ b/lib/gitlab/database/postgresql_adapter/dump_schema_versions_mixin.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module PostgresqlAdapter + module DumpSchemaVersionsMixin + extend ActiveSupport::Concern + + def dump_schema_information # :nodoc: + versions = schema_migration.all_versions + Gitlab::Database::SchemaVersionFiles.touch_all(versions) if versions.any? + + nil + end + end + end + end +end diff --git a/lib/gitlab/database/postgresql_adapter/schema_versions_copy_mixin.rb b/lib/gitlab/database/postgresql_adapter/schema_versions_copy_mixin.rb deleted file mode 100644 index d8f96643dcb..00000000000 --- a/lib/gitlab/database/postgresql_adapter/schema_versions_copy_mixin.rb +++ /dev/null @@ -1,28 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module Database - module PostgresqlAdapter - module SchemaVersionsCopyMixin - extend ActiveSupport::Concern - - def dump_schema_information # :nodoc: - versions = schema_migration.all_versions - copy_versions_sql(versions) if versions.any? - end - - private - - def copy_versions_sql(versions) - sm_table = quote_table_name(schema_migration.table_name) - - sql = +"COPY #{sm_table} (version) FROM STDIN;\n" - sql << versions.map { |v| Integer(v) }.sort.join("\n") - sql << "\n\\.\n" - - sql - end - end - end - end -end diff --git a/lib/gitlab/database/postgresql_database_tasks/load_schema_versions_mixin.rb b/lib/gitlab/database/postgresql_database_tasks/load_schema_versions_mixin.rb new file mode 100644 index 00000000000..cf8342941c4 --- /dev/null +++ b/lib/gitlab/database/postgresql_database_tasks/load_schema_versions_mixin.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module PostgresqlDatabaseTasks + module LoadSchemaVersionsMixin + extend ActiveSupport::Concern + + def structure_load(*args) + super(*args) + Gitlab::Database::SchemaVersionFiles.load_all + end + end + end + end +end diff --git a/lib/gitlab/database/rename_reserved_paths_migration/v1/rename_namespaces.rb b/lib/gitlab/database/rename_reserved_paths_migration/v1/rename_namespaces.rb index 6b9af51a6ab..4fbbfdc4914 100644 --- a/lib/gitlab/database/rename_reserved_paths_migration/v1/rename_namespaces.rb +++ b/lib/gitlab/database/rename_reserved_paths_migration/v1/rename_namespaces.rb @@ -45,7 +45,7 @@ module Gitlab reverts_for_type('namespace') do |path_before_rename, current_path| matches_path = MigrationClasses::Route.arel_table[:path].matches(current_path) namespace = MigrationClasses::Namespace.joins(:route) - .find_by(matches_path)&.becomes(MigrationClasses::Namespace) + .find_by(matches_path)&.becomes(MigrationClasses::Namespace) # rubocop: disable Cop/AvoidBecomes if namespace perform_rename(namespace, current_path, path_before_rename) diff --git a/lib/gitlab/database/schema_cleaner.rb b/lib/gitlab/database/schema_cleaner.rb index ae9d77e635e..7c415287878 100644 --- a/lib/gitlab/database/schema_cleaner.rb +++ b/lib/gitlab/database/schema_cleaner.rb @@ -23,6 +23,11 @@ module Gitlab structure.gsub!(/\n{3,}/, "\n\n") io << structure + io << <<~MSG + -- schema_migrations.version information is no longer stored in this file, + -- but instead tracked in the db/schema_migrations directory + -- see https://gitlab.com/gitlab-org/gitlab/-/issues/218590 for details + MSG nil end diff --git a/lib/gitlab/database/schema_helpers.rb b/lib/gitlab/database/schema_helpers.rb index 34daafd06de..dda4d8eecdb 100644 --- a/lib/gitlab/database/schema_helpers.rb +++ b/lib/gitlab/database/schema_helpers.rb @@ -25,7 +25,7 @@ module Gitlab CREATE TRIGGER #{name} #{fires} ON #{table_name} FOR EACH ROW - EXECUTE PROCEDURE #{function_name}() + EXECUTE FUNCTION #{function_name}() SQL end diff --git a/lib/gitlab/database/schema_version_files.rb b/lib/gitlab/database/schema_version_files.rb new file mode 100644 index 00000000000..27a942404ef --- /dev/null +++ b/lib/gitlab/database/schema_version_files.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +module Gitlab + module Database + class SchemaVersionFiles + SCHEMA_DIRECTORY = 'db/schema_migrations' + MIGRATION_DIRECTORIES = %w[db/migrate db/post_migrate].freeze + MIGRATION_VERSION_GLOB = '20[0-9][0-9]*' + + def self.touch_all(versions_from_database) + versions_from_migration_files = find_versions_from_migration_files + + version_filepaths = find_version_filenames.map { |f| schema_directory.join(f) } + FileUtils.rm(version_filepaths) + + versions_to_create = versions_from_database & versions_from_migration_files + versions_to_create.each do |version| + version_filepath = schema_directory.join(version) + + File.open(version_filepath, 'w') do |file| + file << Digest::SHA256.hexdigest(version) + end + end + end + + def self.load_all + version_filenames = find_version_filenames + return if version_filenames.empty? + + values = version_filenames.map { |vf| "('#{connection.quote_string(vf)}')" } + connection.execute(<<~SQL) + INSERT INTO schema_migrations (version) + VALUES #{values.join(',')} + ON CONFLICT DO NOTHING + SQL + end + + def self.schema_directory + @schema_directory ||= Rails.root.join(SCHEMA_DIRECTORY) + end + + def self.migration_directories + @migration_directories ||= MIGRATION_DIRECTORIES.map { |dir| Rails.root.join(dir) } + end + + def self.find_version_filenames + Dir.glob(MIGRATION_VERSION_GLOB, base: schema_directory) + end + + def self.find_versions_from_migration_files + migration_directories.each_with_object([]) do |directory, migration_versions| + directory_migrations = Dir.glob(MIGRATION_VERSION_GLOB, base: directory) + directory_versions = directory_migrations.map! { |m| m.split('_').first } + + migration_versions.concat(directory_versions) + end + end + + def self.connection + ActiveRecord::Base.connection + end + end + end +end diff --git a/lib/gitlab/database/similarity_score.rb b/lib/gitlab/database/similarity_score.rb new file mode 100644 index 00000000000..2633c29438a --- /dev/null +++ b/lib/gitlab/database/similarity_score.rb @@ -0,0 +1,110 @@ +# frozen_string_literal: true + +module Gitlab + module Database + class SimilarityScore + EMPTY_STRING = Arel.sql("''").freeze + EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze + DEFAULT_MULTIPLIER = 1 + + # This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity. + # + # Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller + # resultset which is already filtered by other conditions (< 10_000 records). + # + # ==== Parameters + # * +search+ - [String] the user provided search string + # * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring. + # * +column+ - Arel column expression, example: Project.arel_table["name"] + # * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1) + # + # ==== Use case + # + # We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since + # it's more likely that the user was remembering the path or the name of the project. + # + # Rules: + # [ + # { column: Project.arel_table['path'], multiplier: 1 }, + # { column: Project.arel_table['name'], multiplier: 1 }, + # { column: Project.arel_table['description'], multiplier: 0.5 } + # ] + # + # ==== Examples + # + # Similarity calculation based on one column: + # + # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }]) + # + # Similarity calculation based on two column, where the second column has lower priority: + # + # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [ + # { column: Project.arel_table['name'], multiplier: 1 }, + # { column: Project.arel_table['description'], multiplier: 0.5 } + # ]) + # + # Integration with an ActiveRecord query: + # + # table = Project.arel_table + # + # order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [ + # { column: table['name'], multiplier: 1 }, + # { column: table['description'], multiplier: 0.5 } + # ]) + # + # Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc) + # + # The expression can be also used in SELECT: + # + # results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc) + # puts results.map(&:similarity) + # + def self.build_expression(search:, rules:) + return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty? + + quoted_search = ActiveRecord::Base.connection.quote(search.to_s) + + first_expression, *expressions = rules.map do |rule| + rule_to_arel(quoted_search, rule) + end + + # (SIMILARITY ...) + (SIMILARITY ...) + expressions.inject(first_expression) do |expression1, expression2| + Arel::Nodes::Addition.new(expression1, expression2) + end + end + + # (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric)) + def self.rule_to_arel(search, rule) + Arel::Nodes::Grouping.new( + Arel::Nodes::Multiplication.new( + similarity_function_call(search, column_expression(rule)), + multiplier_expression(rule) + ) + ) + end + + # COALESCE(column, '') + def self.column_expression(rule) + Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING]) + end + + # SIMILARITY(COALESCE(column, ''), 'search_string') + def self.similarity_function_call(search, column) + Arel::Nodes::NamedFunction.new('SIMILARITY', [column, Arel.sql(search)]) + end + + # CAST(multiplier AS numeric) + def self.multiplier_expression(rule) + quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s) + + Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')]) + end + + private_class_method :rule_to_arel + private_class_method :column_expression + private_class_method :similarity_function_call + private_class_method :multiplier_expression + end + end +end diff --git a/lib/gitlab/database/with_lock_retries.rb b/lib/gitlab/database/with_lock_retries.rb index bebcba6f42e..a9c86e4e267 100644 --- a/lib/gitlab/database/with_lock_retries.rb +++ b/lib/gitlab/database/with_lock_retries.rb @@ -2,7 +2,14 @@ module Gitlab module Database + # This class provides a way to automatically execute code that relies on acquiring a database lock in a way + # designed to minimize impact on a busy production database. + # + # A default timing configuration is provided that makes repeated attempts to acquire the necessary lock, with + # varying lock_timeout settings, and also serves to limit the maximum number of attempts. class WithLockRetries + AttemptsExhaustedError = Class.new(StandardError) + NULL_LOGGER = Gitlab::JsonLogger.new('/dev/null') # Each element of the array represents a retry iteration. @@ -63,7 +70,17 @@ module Gitlab @log_params = { method: 'with_lock_retries', class: klass.to_s } end - def run(&block) + # Executes a block of code, retrying it whenever a database lock can't be acquired in time + # + # When a database lock can't be acquired, ActiveRecord throws ActiveRecord::LockWaitTimeout + # exception which we intercept to re-execute the block of code, until it finishes or we reach the + # max attempt limit. The default behavior when max attempts have been reached is to make a final attempt with the + # lock_timeout disabled, but this can be altered with the raise_on_exhaustion parameter. + # + # @see DEFAULT_TIMING_CONFIGURATION for the timings used when attempting a retry + # @param [Boolean] raise_on_exhaustion whether to raise `AttemptsExhaustedError` when exhausting max attempts + # @param [Proc] block of code that will be executed + def run(raise_on_exhaustion: false, &block) raise 'no block given' unless block_given? @block = block @@ -85,6 +102,9 @@ module Gitlab retry else reset_db_settings + + raise AttemptsExhaustedError, 'configured attempts to obtain locks are exhausted' if raise_on_exhaustion + run_block_without_lock_timeout end |