diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2022-01-20 12:16:11 +0300 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2022-01-20 12:16:11 +0300 |
commit | edaa33dee2ff2f7ea3fac488d41558eb5f86d68c (patch) | |
tree | 11f143effbfeba52329fb7afbd05e6e2a3790241 /lib/gitlab/background_migration | |
parent | d8a5691316400a0f7ec4f83832698f1988eb27c1 (diff) |
Add latest changes from gitlab-org/gitlab@14-7-stable-ee (tag: v14.7.0-rc42)
Diffstat (limited to 'lib/gitlab/background_migration')
15 files changed, 539 insertions, 328 deletions
diff --git a/lib/gitlab/background_migration/backfill_ci_namespace_mirrors.rb b/lib/gitlab/background_migration/backfill_ci_namespace_mirrors.rb new file mode 100644 index 00000000000..2247747ba08 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_ci_namespace_mirrors.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # A job to create ci_namespace_mirrors entries in batches + class BackfillCiNamespaceMirrors + class Namespace < ActiveRecord::Base # rubocop:disable Style/Documentation + include ::EachBatch + + self.table_name = 'namespaces' + self.inheritance_column = nil + + scope :base_query, -> do + select(:id, :parent_id) + end + end + + PAUSE_SECONDS = 0.1 + SUB_BATCH_SIZE = 500 + + def perform(start_id, end_id) + batch_query = Namespace.base_query.where(id: start_id..end_id) + batch_query.each_batch(of: SUB_BATCH_SIZE) do |sub_batch| + first, last = sub_batch.pluck(Arel.sql('MIN(id), MAX(id)')).first + ranged_query = Namespace.unscoped.base_query.where(id: first..last) + + update_sql = <<~SQL + INSERT INTO ci_namespace_mirrors (namespace_id, traversal_ids) + #{insert_values(ranged_query)} + ON CONFLICT (namespace_id) DO NOTHING + SQL + # We do nothing on conflict because we consider they were already filled. 
+ + Namespace.connection.execute(update_sql) + + sleep PAUSE_SECONDS + end + + mark_job_as_succeeded(start_id, end_id) + end + + private + + def insert_values(batch) + calculated_traversal_ids( + batch.allow_cross_joins_across_databases(url: 'https://gitlab.com/gitlab-org/gitlab/-/issues/336433') + ) + end + + # Copied from lib/gitlab/background_migration/backfill_namespace_traversal_ids_children.rb + def calculated_traversal_ids(batch) + <<~SQL + WITH RECURSIVE cte(source_id, namespace_id, parent_id, height) AS ( + ( + SELECT batch.id, batch.id, batch.parent_id, 1 + FROM (#{batch.to_sql}) AS batch + ) + UNION ALL + ( + SELECT cte.source_id, n.id, n.parent_id, cte.height+1 + FROM namespaces n, cte + WHERE n.id = cte.parent_id + ) + ) + SELECT flat_hierarchy.source_id as namespace_id, + array_agg(flat_hierarchy.namespace_id ORDER BY flat_hierarchy.height DESC) as traversal_ids + FROM (SELECT * FROM cte FOR UPDATE) flat_hierarchy + GROUP BY flat_hierarchy.source_id + SQL + end + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('BackfillCiNamespaceMirrors', arguments) + end + end + end +end diff --git a/lib/gitlab/background_migration/backfill_ci_project_mirrors.rb b/lib/gitlab/background_migration/backfill_ci_project_mirrors.rb new file mode 100644 index 00000000000..ff6ab9928b0 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_ci_project_mirrors.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # A job to create ci_project_mirrors entries in batches + class BackfillCiProjectMirrors + class Project < ActiveRecord::Base # rubocop:disable Style/Documentation + include ::EachBatch + + self.table_name = 'projects' + + scope :base_query, -> do + select(:id, :namespace_id) + end + end + + PAUSE_SECONDS = 0.1 + SUB_BATCH_SIZE = 500 + + def perform(start_id, end_id) + batch_query = Project.base_query.where(id: start_id..end_id) + batch_query.each_batch(of: 
SUB_BATCH_SIZE) do |sub_batch| + first, last = sub_batch.pluck(Arel.sql('MIN(id), MAX(id)')).first + ranged_query = Project.unscoped.base_query.where(id: first..last) + + update_sql = <<~SQL + INSERT INTO ci_project_mirrors (project_id, namespace_id) + #{insert_values(ranged_query)} + ON CONFLICT (project_id) DO NOTHING + SQL + # We do nothing on conflict because we consider they were already filled. + + Project.connection.execute(update_sql) + + sleep PAUSE_SECONDS + end + + mark_job_as_succeeded(start_id, end_id) + end + + private + + def insert_values(batch) + batch.allow_cross_joins_across_databases(url: 'https://gitlab.com/gitlab-org/gitlab/-/issues/336433').to_sql + end + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('BackfillCiProjectMirrors', arguments) + end + end + end +end diff --git a/lib/gitlab/background_migration/backfill_incident_issue_escalation_statuses.rb b/lib/gitlab/background_migration/backfill_incident_issue_escalation_statuses.rb new file mode 100644 index 00000000000..2d46ff6b933 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_incident_issue_escalation_statuses.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # BackfillIncidentIssueEscalationStatuses adds + # IncidentManagement::IssuableEscalationStatus records for existing Incident issues. + # They will be added with no policy, and escalations_started_at as nil. 
+ class BackfillIncidentIssueEscalationStatuses + def perform(start_id, stop_id) + ActiveRecord::Base.connection.execute <<~SQL + INSERT INTO incident_management_issuable_escalation_statuses (issue_id, created_at, updated_at) + SELECT issues.id, current_timestamp, current_timestamp + FROM issues + WHERE issues.issue_type = 1 + AND issues.id BETWEEN #{start_id} AND #{stop_id} + ON CONFLICT (issue_id) DO NOTHING; + SQL + + mark_job_as_succeeded(start_id, stop_id) + end + + private + + def mark_job_as_succeeded(*arguments) + ::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( + self.class.name.demodulize, + arguments + ) + end + end + end +end diff --git a/lib/gitlab/background_migration/base_job.rb b/lib/gitlab/background_migration/base_job.rb new file mode 100644 index 00000000000..e21e7e0e4a3 --- /dev/null +++ b/lib/gitlab/background_migration/base_job.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Simple base class for background migration job classes which are executed through the sidekiq queue. + # + # Any job class that inherits from the base class will have connection to the tracking database set on + # initialization. + class BaseJob + def initialize(connection:) + @connection = connection + end + + def perform(*arguments) + raise NotImplementedError, "subclasses of #{self.class.name} must implement #{__method__}" + end + + private + + attr_reader :connection + end + end +end diff --git a/lib/gitlab/background_migration/cleanup_concurrent_rename.rb b/lib/gitlab/background_migration/cleanup_concurrent_rename.rb deleted file mode 100644 index d3f366f3480..00000000000 --- a/lib/gitlab/background_migration/cleanup_concurrent_rename.rb +++ /dev/null @@ -1,14 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module BackgroundMigration - # Background migration for cleaning up a concurrent column rename. 
- class CleanupConcurrentRename < CleanupConcurrentSchemaChange - RESCHEDULE_DELAY = 10.minutes - - def cleanup_concurrent_schema_change(table, old_column, new_column) - cleanup_concurrent_column_rename(table, old_column, new_column) - end - end - end -end diff --git a/lib/gitlab/background_migration/cleanup_concurrent_schema_change.rb b/lib/gitlab/background_migration/cleanup_concurrent_schema_change.rb deleted file mode 100644 index 91b50c1a493..00000000000 --- a/lib/gitlab/background_migration/cleanup_concurrent_schema_change.rb +++ /dev/null @@ -1,56 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module BackgroundMigration - # Base class for background migration for rename/type changes. - class CleanupConcurrentSchemaChange - include Database::MigrationHelpers - - # table - The name of the table the migration is performed for. - # old_column - The name of the old (to drop) column. - # new_column - The name of the new column. - def perform(table, old_column, new_column) - return unless column_exists?(table, new_column) && column_exists?(table, old_column) - - rows_to_migrate = define_model_for(table) - .where(new_column => nil) - .where - .not(old_column => nil) - - if rows_to_migrate.any? - BackgroundMigrationWorker.perform_in( - RESCHEDULE_DELAY, - self.class.name, - [table, old_column, new_column] - ) - else - cleanup_concurrent_schema_change(table, old_column, new_column) - end - end - - def cleanup_concurrent_schema_change(_table, _old_column, _new_column) - raise NotImplementedError - end - - # These methods are necessary so we can re-use the migration helpers in - # this class. 
- def connection - ActiveRecord::Base.connection - end - - def method_missing(name, *args, &block) - connection.__send__(name, *args, &block) # rubocop: disable GitlabSecurity/PublicSend - end - - def respond_to_missing?(*args) - connection.respond_to?(*args) || super - end - - def define_model_for(table) - Class.new(ActiveRecord::Base) do - self.table_name = table - end - end - end - end -end diff --git a/lib/gitlab/background_migration/cleanup_concurrent_type_change.rb b/lib/gitlab/background_migration/cleanup_concurrent_type_change.rb deleted file mode 100644 index 48411095dbb..00000000000 --- a/lib/gitlab/background_migration/cleanup_concurrent_type_change.rb +++ /dev/null @@ -1,14 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module BackgroundMigration - # Background migration for cleaning up a concurrent column type changeb. - class CleanupConcurrentTypeChange < CleanupConcurrentSchemaChange - RESCHEDULE_DELAY = 10.minutes - - def cleanup_concurrent_schema_change(table, old_column, new_column) - cleanup_concurrent_column_type_change(table, old_column) - end - end - end -end diff --git a/lib/gitlab/background_migration/copy_column.rb b/lib/gitlab/background_migration/copy_column.rb deleted file mode 100644 index ef70f37d5eb..00000000000 --- a/lib/gitlab/background_migration/copy_column.rb +++ /dev/null @@ -1,41 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module BackgroundMigration - # CopyColumn is a simple (reusable) background migration that can be used to - # update the value of a column based on the value of another column in the - # same table. - # - # For this background migration to work the table that is migrated _has_ to - # have an `id` column as the primary key. - class CopyColumn - # table - The name of the table that contains the columns. - # copy_from - The column containing the data to copy. - # copy_to - The column to copy the data to. - # start_id - The start ID of the range of rows to update. 
- # end_id - The end ID of the range of rows to update. - def perform(table, copy_from, copy_to, start_id, end_id) - return unless connection.column_exists?(table, copy_to) - - quoted_table = connection.quote_table_name(table) - quoted_copy_from = connection.quote_column_name(copy_from) - quoted_copy_to = connection.quote_column_name(copy_to) - - # We're using raw SQL here since this job may be frequently executed. As - # a result dynamically defining models would lead to many unnecessary - # schema information queries. - connection.execute <<-SQL.strip_heredoc - UPDATE #{quoted_table} - SET #{quoted_copy_to} = #{quoted_copy_from} - WHERE id BETWEEN #{start_id} AND #{end_id} - AND #{quoted_copy_from} IS NOT NULL - AND #{quoted_copy_to} IS NULL - SQL - end - - def connection - ActiveRecord::Base.connection - end - end - end -end diff --git a/lib/gitlab/background_migration/encrypt_static_object_token.rb b/lib/gitlab/background_migration/encrypt_static_object_token.rb new file mode 100644 index 00000000000..80931353e2f --- /dev/null +++ b/lib/gitlab/background_migration/encrypt_static_object_token.rb @@ -0,0 +1,70 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Populates "static_object_token_encrypted" field with encrypted versions + # of values from "static_object_token" field + class EncryptStaticObjectToken + # rubocop:disable Style/Documentation + class User < ActiveRecord::Base + include ::EachBatch + self.table_name = 'users' + scope :with_static_object_token, -> { where.not(static_object_token: nil) } + scope :without_static_object_token_encrypted, -> { where(static_object_token_encrypted: nil) } + end + # rubocop:enable Style/Documentation + + BATCH_SIZE = 100 + + def perform(start_id, end_id) + ranged_query = User + .where(id: start_id..end_id) + .with_static_object_token + .without_static_object_token_encrypted + + ranged_query.each_batch(of: BATCH_SIZE) do |sub_batch| + first, last = sub_batch.pluck(Arel.sql('min(id), 
max(id)')).first + + batch_query = User.unscoped + .where(id: first..last) + .with_static_object_token + .without_static_object_token_encrypted + + user_tokens = batch_query.pluck(:id, :static_object_token) + + user_encrypted_tokens = user_tokens.map do |(id, plaintext_token)| + next if plaintext_token.blank? + + [id, Gitlab::CryptoHelper.aes256_gcm_encrypt(plaintext_token)] + end + + encrypted_tokens_sql = user_encrypted_tokens.compact.map { |(id, token)| "(#{id}, '#{token}')" }.join(',') + + if user_encrypted_tokens.present? + User.connection.execute(<<~SQL) + WITH cte(cte_id, cte_token) AS #{::Gitlab::Database::AsWithMaterialized.materialized_if_supported} ( + SELECT * + FROM (VALUES #{encrypted_tokens_sql}) AS t (id, token) + ) + UPDATE #{User.table_name} + SET static_object_token_encrypted = cte_token + FROM cte + WHERE cte_id = id + SQL + end + + mark_job_as_succeeded(start_id, end_id) + end + end + + private + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( + self.class.name.demodulize, + arguments + ) + end + end + end +end diff --git a/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb b/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb new file mode 100644 index 00000000000..2b049ea2d2f --- /dev/null +++ b/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb @@ -0,0 +1,124 @@ +# frozen_string_literal: true + +require 'parser/ruby27' + +module Gitlab + module BackgroundMigration + # This migration fixes raw_metadata entries which have incorrectly been passed a Ruby Hash instead of JSON data. + class FixVulnerabilityOccurrencesWithHashesAsRawMetadata + CLUSTER_IMAGE_SCANNING_REPORT_TYPE = 7 + GENERIC_REPORT_TYPE = 99 + + # Type error is used to handle unexpected types when parsing stringified hashes. 
+ class TypeError < ::StandardError + attr_reader :message, :type + + def initialize(message, type) + @message = message + @type = type + end + end + + # Migration model namespace isolated from application code. + class Finding < ActiveRecord::Base + include EachBatch + + self.table_name = 'vulnerability_occurrences' + + scope :by_api_report_types, -> { where(report_type: [CLUSTER_IMAGE_SCANNING_REPORT_TYPE, GENERIC_REPORT_TYPE]) } + end + + def perform(start_id, end_id) + Finding.by_api_report_types.where(id: start_id..end_id).each do |finding| + next if valid_json?(finding.raw_metadata) + + metadata = hash_from_s(finding.raw_metadata) + + finding.update(raw_metadata: metadata.to_json) if metadata + end + mark_job_as_succeeded(start_id, end_id) + end + + def hash_from_s(str_hash) + ast = Parser::Ruby27.parse(str_hash) + + unless ast.type == :hash + ::Gitlab::AppLogger.error(message: "expected raw_metadata to be a hash", type: ast.type) + return + end + + parse_hash(ast) + rescue Parser::SyntaxError => e + ::Gitlab::AppLogger.error(message: "error parsing raw_metadata", error: e.message) + nil + rescue TypeError => e + ::Gitlab::AppLogger.error(message: "error parsing raw_metadata", error: e.message, type: e.type) + nil + end + + private + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( + 'FixVulnerabilityOccurrencesWithHashesAsRawMetadata', + arguments + ) + end + + def valid_json?(metadata) + Oj.load(metadata) + true + rescue Oj::ParseError, Encoding::UndefinedConversionError + false + end + + def parse_hash(hash) + out = {} + hash.children.each do |node| + unless node.type == :pair + raise TypeError.new("expected child of hash to be a `pair`", node.type) + end + + key, value = node.children + + key = parse_key(key) + value = parse_value(value) + + out[key] = value + end + + out + end + + def parse_key(key) + case key.type + when :sym, :str, :int + key.children.first + else + raise TypeError.new("expected 
key to be either symbol, string, or integer", key.type) + end + end + + def parse_value(value) + case value.type + when :sym, :str, :int + value.children.first + # rubocop:disable Lint/BooleanSymbol + when :true + true + when :false + false + # rubocop:enable Lint/BooleanSymbol + when :nil + nil + when :array + value.children.map { |c| parse_value(c) } + when :hash + parse_hash(value) + else + raise TypeError.new("value of a pair was an unexpected type", value.type) + end + end + end + end +end diff --git a/lib/gitlab/background_migration/job_coordinator.rb b/lib/gitlab/background_migration/job_coordinator.rb index cfbe7167677..5dc77f935e3 100644 --- a/lib/gitlab/background_migration/job_coordinator.rb +++ b/lib/gitlab/background_migration/job_coordinator.rb @@ -36,6 +36,8 @@ module Gitlab attr_reader :worker_class + delegate :minimum_interval, :perform_in, to: :worker_class + def queue @queue ||= worker_class.sidekiq_options['queue'] end @@ -79,7 +81,7 @@ module Gitlab def perform(class_name, arguments) with_shared_connection do - migration_class_for(class_name).new.perform(*arguments) + migration_instance_for(class_name).perform(*arguments) end end @@ -113,6 +115,16 @@ module Gitlab enqueued_job?([retry_set], migration_class) end + def migration_instance_for(class_name) + migration_class = migration_class_for(class_name) + + if migration_class < Gitlab::BackgroundMigration::BaseJob + migration_class.new(connection: connection) + else + migration_class.new + end + end + def migration_class_for(class_name) Gitlab::BackgroundMigration.const_get(class_name, false) end diff --git a/lib/gitlab/background_migration/migrate_legacy_artifacts.rb b/lib/gitlab/background_migration/migrate_legacy_artifacts.rb deleted file mode 100644 index 23d99274232..00000000000 --- a/lib/gitlab/background_migration/migrate_legacy_artifacts.rb +++ /dev/null @@ -1,130 +0,0 @@ -# frozen_string_literal: true -# rubocop:disable Metrics/ClassLength - -module Gitlab - module BackgroundMigration - 
## - # The class to migrate job artifacts from `ci_builds` to `ci_job_artifacts` - class MigrateLegacyArtifacts - FILE_LOCAL_STORE = 1 # equal to ObjectStorage::Store::LOCAL - ARCHIVE_FILE_TYPE = 1 # equal to Ci::JobArtifact.file_types['archive'] - METADATA_FILE_TYPE = 2 # equal to Ci::JobArtifact.file_types['metadata'] - LEGACY_PATH_FILE_LOCATION = 1 # equal to Ci::JobArtifact.file_location['legacy_path'] - - def perform(start_id, stop_id) - ActiveRecord::Base.transaction do - insert_archives(start_id, stop_id) - insert_metadatas(start_id, stop_id) - delete_legacy_artifacts(start_id, stop_id) - end - end - - private - - def insert_archives(start_id, stop_id) - ActiveRecord::Base.connection.execute <<~SQL - INSERT INTO - ci_job_artifacts ( - project_id, - job_id, - expire_at, - file_location, - created_at, - updated_at, - file, - size, - file_store, - file_type - ) - SELECT - project_id, - id, - artifacts_expire_at #{add_missing_db_timezone}, - #{LEGACY_PATH_FILE_LOCATION}, - created_at #{add_missing_db_timezone}, - created_at #{add_missing_db_timezone}, - artifacts_file, - artifacts_size, - COALESCE(artifacts_file_store, #{FILE_LOCAL_STORE}), - #{ARCHIVE_FILE_TYPE} - FROM - ci_builds - WHERE - id BETWEEN #{start_id.to_i} AND #{stop_id.to_i} - AND artifacts_file <> '' - AND NOT EXISTS ( - SELECT - 1 - FROM - ci_job_artifacts - WHERE - ci_builds.id = ci_job_artifacts.job_id - AND ci_job_artifacts.file_type = #{ARCHIVE_FILE_TYPE}) - SQL - end - - def insert_metadatas(start_id, stop_id) - ActiveRecord::Base.connection.execute <<~SQL - INSERT INTO - ci_job_artifacts ( - project_id, - job_id, - expire_at, - file_location, - created_at, - updated_at, - file, - size, - file_store, - file_type - ) - SELECT - project_id, - id, - artifacts_expire_at #{add_missing_db_timezone}, - #{LEGACY_PATH_FILE_LOCATION}, - created_at #{add_missing_db_timezone}, - created_at #{add_missing_db_timezone}, - artifacts_metadata, - NULL, - COALESCE(artifacts_metadata_store, 
#{FILE_LOCAL_STORE}), - #{METADATA_FILE_TYPE} - FROM - ci_builds - WHERE - id BETWEEN #{start_id.to_i} AND #{stop_id.to_i} - AND artifacts_file <> '' - AND artifacts_metadata <> '' - AND NOT EXISTS ( - SELECT - 1 - FROM - ci_job_artifacts - WHERE - ci_builds.id = ci_job_artifacts.job_id - AND ci_job_artifacts.file_type = #{METADATA_FILE_TYPE}) - SQL - end - - def delete_legacy_artifacts(start_id, stop_id) - ActiveRecord::Base.connection.execute <<~SQL - UPDATE - ci_builds - SET - artifacts_file = NULL, - artifacts_file_store = NULL, - artifacts_size = NULL, - artifacts_metadata = NULL, - artifacts_metadata_store = NULL - WHERE - id BETWEEN #{start_id.to_i} AND #{stop_id.to_i} - AND artifacts_file <> '' - SQL - end - - def add_missing_db_timezone - 'at time zone \'UTC\'' - end - end - end -end diff --git a/lib/gitlab/background_migration/populate_test_reports_issue_id.rb b/lib/gitlab/background_migration/populate_test_reports_issue_id.rb new file mode 100644 index 00000000000..301efd0c943 --- /dev/null +++ b/lib/gitlab/background_migration/populate_test_reports_issue_id.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true +# rubocop: disable Style/Documentation + +module Gitlab + module BackgroundMigration + class PopulateTestReportsIssueId + def perform(start_id, stop_id) + # NO OP + end + end + end +end + +Gitlab::BackgroundMigration::PopulateTestReportsIssueId.prepend_mod diff --git a/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb b/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb index 84ff7423254..c1b8de1f6aa 100644 --- a/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb +++ b/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true # rubocop: disable Style/Documentation -class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid +class 
Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid # rubocop:disable Metrics/ClassLength # rubocop: disable Gitlab/NamespacedClass class VulnerabilitiesIdentifier < ActiveRecord::Base self.table_name = "vulnerability_identifiers" @@ -9,10 +9,14 @@ class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid end class VulnerabilitiesFinding < ActiveRecord::Base + include EachBatch include ShaAttribute self.table_name = "vulnerability_occurrences" + + has_many :signatures, foreign_key: 'finding_id', class_name: 'VulnerabilityFindingSignature', inverse_of: :finding belongs_to :primary_identifier, class_name: 'VulnerabilitiesIdentifier', inverse_of: :primary_findings, foreign_key: 'primary_identifier_id' + REPORT_TYPES = { sast: 0, dependency_scanning: 1, @@ -20,7 +24,9 @@ class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid dast: 3, secret_detection: 4, coverage_fuzzing: 5, - api_fuzzing: 6 + api_fuzzing: 6, + cluster_image_scanning: 7, + generic: 99 }.with_indifferent_access.freeze enum report_type: REPORT_TYPES @@ -28,6 +34,25 @@ class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid sha_attribute :location_fingerprint end + class VulnerabilityFindingSignature < ActiveRecord::Base + include ShaAttribute + + self.table_name = 'vulnerability_finding_signatures' + belongs_to :finding, foreign_key: 'finding_id', inverse_of: :signatures, class_name: 'VulnerabilitiesFinding' + + sha_attribute :signature_sha + end + + class VulnerabilitiesFindingPipeline < ActiveRecord::Base + include EachBatch + self.table_name = "vulnerability_occurrence_pipelines" + end + + class Vulnerability < ActiveRecord::Base + include EachBatch + self.table_name = "vulnerabilities" + end + class CalculateFindingUUID FINDING_NAMESPACES_IDS = { development: "a143e9e2-41b3-47bc-9a19-081d089229f4", @@ -52,35 +77,122 @@ class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid end # rubocop: enable 
Gitlab/NamespacedClass + # rubocop: disable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength def perform(start_id, end_id) - findings = VulnerabilitiesFinding - .joins(:primary_identifier) - .select(:id, :report_type, :fingerprint, :location_fingerprint, :project_id) - .where(id: start_id..end_id) - - mappings = findings.each_with_object({}) do |finding, hash| - hash[finding] = { uuid: calculate_uuid_v5_for_finding(finding) } + unless Feature.enabled?(:migrate_vulnerability_finding_uuids, default_enabled: true) + return log_info('Migration is disabled by the feature flag', start_id: start_id, end_id: end_id) end - ::Gitlab::Database::BulkUpdate.execute(%i[uuid], mappings) + log_info('Migration started', start_id: start_id, end_id: end_id) - logger.info(message: 'RecalculateVulnerabilitiesOccurrencesUuid Migration: recalculation is done for:', - finding_ids: mappings.keys.pluck(:id)) + VulnerabilitiesFinding + .joins(:primary_identifier) + .includes(:signatures) + .select(:id, :report_type, :primary_identifier_id, :fingerprint, :location_fingerprint, :project_id, :created_at, :vulnerability_id, :uuid) + .where(id: start_id..end_id) + .each_batch(of: 50) do |relation| + duplicates = find_duplicates(relation) + remove_findings(ids: duplicates) if duplicates.present? + + to_update = relation.reject { |finding| duplicates.include?(finding.id) } + + begin + known_uuids = Set.new + to_be_deleted = [] + + mappings = to_update.each_with_object({}) do |finding, hash| + uuid = calculate_uuid_v5_for_finding(finding) + + if known_uuids.add?(uuid) + hash[finding] = { uuid: uuid } + else + to_be_deleted << finding.id + end + end + + # It is technically still possible to have duplicate uuids + # if the data integrity is broken somehow and the primary identifiers of + # the findings are pointing to different projects with the same fingerprint values. + if to_be_deleted.present? 
+ log_info('Conflicting UUIDs found within the batch', finding_ids: to_be_deleted) + + remove_findings(ids: to_be_deleted) + end + + ::Gitlab::Database::BulkUpdate.execute(%i[uuid], mappings) if mappings.present? + + log_info('Recalculation is done', finding_ids: mappings.keys.pluck(:id)) + rescue ActiveRecord::RecordNotUnique => error + log_info('RecordNotUnique error received') + + match_data = /\(uuid\)=\((?<uuid>\S{36})\)/.match(error.message) + + # This exception returns the **correct** UUIDv5 which probably comes from a later record + # and it's the one we can drop in the easiest way before retrying the UPDATE query + if match_data + uuid = match_data[:uuid] + log_info('Conflicting UUID found', uuid: uuid) + + id = VulnerabilitiesFinding.find_by(uuid: uuid)&.id + remove_findings(ids: id) if id + retry + else + log_error('Couldnt find conflicting uuid') + + Gitlab::ErrorTracking.track_and_raise_exception(error) + end + end + end mark_job_as_succeeded(start_id, end_id) rescue StandardError => error - Gitlab::ErrorTracking.track_and_raise_for_dev_exception(error) + log_error('An exception happened') + + Gitlab::ErrorTracking.track_and_raise_exception(error) end + # rubocop: disable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength private + def find_duplicates(relation) + to_exclude = [] + relation.flat_map do |record| + # Assuming we're scanning id 31 and the duplicate is id 40 + # first we'd process 31 and add 40 to the list of ids to remove + # then we would process record 40 and add 31 to the list of removals + # so we would drop both records + to_exclude << record.id + + VulnerabilitiesFinding.where( + report_type: record.report_type, + location_fingerprint: record.location_fingerprint, + primary_identifier_id: record.primary_identifier_id, + project_id: record.project_id + ).where.not(id: to_exclude).pluck(:id) + end + end + + def remove_findings(ids:) + ids = Array(ids) + log_info('Removing Findings and associated records', ids: ids) + + 
vulnerability_ids = VulnerabilitiesFinding.where(id: ids).pluck(:vulnerability_id).uniq.compact + + VulnerabilitiesFindingPipeline.where(occurrence_id: ids).each_batch { |batch| batch.delete_all } + Vulnerability.where(id: vulnerability_ids).each_batch { |batch| batch.delete_all } + VulnerabilitiesFinding.where(id: ids).delete_all + end + def calculate_uuid_v5_for_finding(vulnerability_finding) return unless vulnerability_finding + signatures = vulnerability_finding.signatures.sort_by { |signature| signature.algorithm_type_before_type_cast } + location_fingerprint = signatures.last&.signature_sha || vulnerability_finding.location_fingerprint + uuid_v5_name_components = { report_type: vulnerability_finding.report_type, primary_identifier_fingerprint: vulnerability_finding.fingerprint, - location_fingerprint: vulnerability_finding.location_fingerprint, + location_fingerprint: location_fingerprint, project_id: vulnerability_finding.project_id } @@ -89,6 +201,14 @@ class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid CalculateFindingUUID.call(name) end + def log_info(message, **extra) + logger.info(migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra) + end + + def log_error(message, **extra) + logger.error(migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra) + end + def logger @logger ||= Gitlab::BackgroundMigration::Logger.build end diff --git a/lib/gitlab/background_migration/remove_duplicate_services.rb b/lib/gitlab/background_migration/remove_duplicate_services.rb deleted file mode 100644 index 59fb9143a72..00000000000 --- a/lib/gitlab/background_migration/remove_duplicate_services.rb +++ /dev/null @@ -1,58 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module BackgroundMigration - # Remove duplicated service records with the same project and type. 
- # These were created in the past for unknown reasons, and should be blocked - # now by the uniqueness validation in the Service model. - class RemoveDuplicateServices - # See app/models/service - class Service < ActiveRecord::Base - include EachBatch - - self.table_name = 'services' - self.inheritance_column = :_type_disabled - - scope :project_ids_with_duplicates, -> do - select(:project_id) - .distinct - .where.not(project_id: nil) - .group(:project_id, :type) - .having('count(*) > 1') - end - - scope :types_with_duplicates, -> (project_ids) do - select(:project_id, :type) - .where(project_id: project_ids) - .group(:project_id, :type) - .having('count(*) > 1') - end - end - - def perform(*project_ids) - types_with_duplicates = Service.types_with_duplicates(project_ids).pluck(:project_id, :type) - - types_with_duplicates.each do |project_id, type| - remove_duplicates(project_id, type) - end - end - - private - - def remove_duplicates(project_id, type) - scope = Service.where(project_id: project_id, type: type) - - # Build a subquery to determine which service record is actually in use, - # by querying for it without specifying an order. - # - # This should match the record returned by `Project#find_service`, - # and the `has_one` service associations on `Project`. - correct_service = scope.select(:id).limit(1) - - # Delete all other services with the same `project_id` and `type` - duplicate_services = scope.where.not(id: correct_service) - duplicate_services.delete_all - end - end - end -end |