Welcome to mirror list, hosted at ThFree Co, Russian Federation.

batched_background_migration_helpers.rb « migrations « database « gitlab « lib - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 64cde273a59f346656af0530914dc9a7151a87c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# frozen_string_literal: true

module Gitlab
  module Database
    module Migrations
      # BatchedBackgroundMigrations are a new approach to scheduling and executing background migrations, which uses
      # persistent state in the database to track each migration. This avoids having to batch over an entire table and
      # schedule a large number of sidekiq jobs upfront. It also provides for more flexibility as the migration runs,
      # as it can be paused and restarted, and have configuration values like the batch size updated dynamically as the
      # migration runs.
      #
      # For now, these migrations are not considered ready for general use, for more information see the tracking epic:
      # https://gitlab.com/groups/gitlab-org/-/epics/6751
      module BatchedBackgroundMigrationHelpers
        # Raised by `ensure_batched_background_migration_is_finished` when the `DBLAB_ENVIRONMENT` environment
        # variable is set and no batched migration record matches the given configuration.
        NonExistentMigrationError = Class.new(StandardError)

        # Defaults used by `queue_batched_background_migration` when the caller does not override them.
        BATCH_SIZE = 1_000 # Number of rows to process per job
        SUB_BATCH_SIZE = 100 # Number of rows to process per sub-batch
        BATCH_CLASS_NAME = 'PrimaryKeyBatchingStrategy' # Default batch class for batched migrations
        BATCH_MIN_VALUE = 1 # Default minimum value for batched migrations
        # Lower bound for `job_interval`: shorter intervals are replaced by this value when a migration is queued.
        BATCH_MIN_DELAY = 2.minutes.freeze # Minimum delay between batched migrations

        # Queues a batched background migration for the given table. A batched migration runs one job at a time,
        # deriving the bounds of the next batch from the current migration settings and the previous batch bounds.
        # Each job's execution status is tracked in the database while the migration runs. The job class must be
        # present in the Gitlab::BackgroundMigration module, and the batch class (if specified) must be present in
        # the Gitlab::BackgroundMigration::BatchingStrategies module.
        #
        # If a migration with the same gitlab_schema, job_class_name, table_name, column_name, and job_arguments
        # already exists, this helper logs a warning and does not create a new one.
        #
        # job_class_name - The background migration job class as a string
        # batch_table_name - The name of the table the migration will batch over
        # batch_column_name - The name of the column the migration will batch over
        # job_arguments - Extra positional arguments to pass to the job instance when the migration runs
        # job_interval - The pause interval between each job's execution; values below BATCH_MIN_DELAY are
        #                replaced by BATCH_MIN_DELAY
        # queued_migration_version - Version of the migration that queues the BBM, used to establish dependencies
        # batch_min_value - The value in the column the batching will begin at
        # batch_max_value - The value in the column the batching will end at, defaults to `SELECT MAX(batch_column)`
        # batch_class_name - The name of the class that will be called to find the range of each next batch
        # batch_size - The maximum number of rows per job
        # max_batch_size - Stored on the record as `max_batch_size` when the model has that attribute
        #                  (presumably an upper bound for a dynamically adjusted batch size - confirm against the model)
        # sub_batch_size - The maximum number of rows processed per "iteration" within the job
        # gitlab_schema - The schema the migration is looked up and recorded under, defaults to the schema
        #                 derived from the migration context (see `gitlab_schema_from_context`)
        #
        # queued_migration_version is made optional temporarily to allow prior migrations to not fail,
        # https://gitlab.com/gitlab-org/gitlab/-/issues/426417 will make it mandatory.
        #
        # *Returns the created BatchedMigration record, or nil when a matching migration already exists*
        #
        # Raises a RuntimeError when the job class declares a `job_arguments_count` that differs from the number
        # of job_arguments given.
        #
        # Example:
        #
        #     queue_batched_background_migration(
        #       'CopyColumnUsingBackgroundMigrationJob',
        #       :events,
        #       :id,
        #       'column1',
        #       'column2',
        #       job_interval: 2.minutes)
        #
        # Where the background migration exists:
        #
        #     class Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob
        #       def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, *other_args)
        #         # do something
        #       end
        #     end
        def queue_batched_background_migration( # rubocop:disable Metrics/ParameterLists
          job_class_name,
          batch_table_name,
          batch_column_name,
          *job_arguments,
          job_interval:,
          queued_migration_version: nil,
          batch_min_value: BATCH_MIN_VALUE,
          batch_max_value: nil,
          batch_class_name: BATCH_CLASS_NAME,
          batch_size: BATCH_SIZE,
          max_batch_size: nil,
          sub_batch_size: SUB_BATCH_SIZE,
          gitlab_schema: nil
        )
          Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode!

          gitlab_schema ||= gitlab_schema_from_context
          batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration

          # Refresh the model's cached column information before it is queried or instantiated.
          batched_migrations.reset_column_information

          already_queued = batched_migrations
            .for_configuration(gitlab_schema, job_class_name, batch_table_name, batch_column_name, job_arguments)
            .exists?

          if already_queued
            Gitlab::AppLogger.warn "Batched background migration not enqueued because it already exists: " \
              "job_class_name: #{job_class_name}, table_name: #{batch_table_name}, column_name: #{batch_column_name}, " \
              "job_arguments: #{job_arguments.inspect}"
            return
          end

          effective_interval = job_interval < BATCH_MIN_DELAY ? BATCH_MIN_DELAY : job_interval

          # Only query the table for its maximum when the caller did not supply an upper bound.
          upper_bound = batch_max_value || connection.select_value(<<~SQL)
            SELECT MAX(#{connection.quote_column_name(batch_column_name)})
            FROM #{connection.quote_table_name(batch_table_name)}
          SQL

          # With no upper bound at all (no explicit value and MAX returned NULL) the record is created with
          # the :finish status event instead of :execute, and the minimum doubles as the maximum.
          initial_event = upper_bound.nil? ? :finish : :execute

          migration = batched_migrations.new(
            job_class_name: job_class_name,
            table_name: batch_table_name,
            column_name: batch_column_name,
            job_arguments: job_arguments,
            interval: effective_interval,
            min_value: batch_min_value,
            max_value: upper_bound || batch_min_value,
            batch_class_name: batch_class_name,
            batch_size: batch_size,
            sub_batch_size: sub_batch_size,
            status_event: initial_event
          )

          # Job classes may declare how many extra arguments they expect; reject a mismatch before saving.
          job_class = migration.job_class
          if job_class.respond_to?(:job_arguments_count) && job_class.job_arguments_count != job_arguments.count
            raise "Wrong number of job arguments for #{migration.job_class_name} " \
              "(given #{job_arguments.count}, expected #{job_class.job_arguments_count})"
          end

          assign_attribtues_safely(migration, max_batch_size, batch_table_name, gitlab_schema, queued_migration_version)

          migration.tap(&:save!)
        end

        # Finalizes the batched background migration matching the given configuration by delegating to
        # `BatchedMigrationRunner.finalize` on the restored connection.
        #
        # job_class_name - The background migration job class as a string
        # table_name - The name of the table the migration iterates over
        # column_name - The name of the column the migration batches over
        # job_arguments - Migration arguments
        #
        # Raises a RuntimeError when called inside a transaction, or when no migration matches the configuration
        # for the schema returned by `gitlab_schema_from_context`.
        def finalize_batched_background_migration(job_class_name:, table_name:, column_name:, job_arguments:)
          schema_restrictions = Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas
          schema_restrictions.require_dml_mode!

          in_transaction_message =
            'The `finalize_batched_background_migration` cannot be run inside a transaction. ' \
            'You can disable transactions by calling `disable_ddl_transaction!` in the body of ' \
            'your migration class.'
          raise in_transaction_message if transaction_open?

          batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration
          batched_migrations.reset_column_information

          # The record itself is not needed here; the lookup only guards against finalizing something unknown.
          migration = batched_migrations.find_for_configuration(
            gitlab_schema_from_context, job_class_name, table_name, column_name, job_arguments
          )
          raise 'Could not find batched background migration' if migration.nil?

          with_restored_connection_stack do |restored_connection|
            schema_restrictions.with_suppressed do
              Gitlab::Database::BackgroundMigration::BatchedMigrationRunner.finalize(
                job_class_name, table_name, column_name, job_arguments, connection: restored_connection
              )
            end
          end
        end

        # Removes the batched background migration records that match the given configuration within the
        # schema returned by `gitlab_schema_from_context`. Matching records are removed with `delete_all`.
        #
        # job_class_name - The background migration job class as a string
        # table_name - The name of the table the migration iterates over
        # column_name - The name of the column the migration will batch over
        # job_arguments - Migration arguments
        #
        # Returns the result of `delete_all` on the matching scope.
        #
        # Example:
        #
        #     delete_batched_background_migration(
        #       'CopyColumnUsingBackgroundMigrationJob',
        #       :events,
        #       :id,
        #       ['column1', 'column2'])
        def delete_batched_background_migration(job_class_name, table_name, column_name, job_arguments)
          Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode!

          batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration
          batched_migrations.reset_column_information

          matching_migrations = batched_migrations.for_configuration(
            gitlab_schema_from_context, job_class_name, table_name, column_name, job_arguments
          )
          matching_migrations.delete_all
        end

        # Resolves the gitlab schema for the current migration context.
        #
        # Returns the first entry of `allowed_gitlab_schemas` when the including object exposes it
        # (Gitlab::Database::Migration::V2_0), and :gitlab_main otherwise (Gitlab::Database::Migration::V1_0).
        def gitlab_schema_from_context
          return :gitlab_main unless respond_to?(:allowed_gitlab_schemas)

          # `Array()` accepts a single schema as well as a list of schemas.
          Array(allowed_gitlab_schemas).first
        end

        # Verifies that the batched background migration matching the given configuration is finished,
        # optionally finalizing it first.
        #
        # job_class_name - The background migration job class as a string
        # table_name - The name of the table the migration iterates over
        # column_name - The name of the column the migration batches over
        # job_arguments - Migration arguments
        # finalize - When true (the default), an unfinished migration is finalized through
        #            `finalize_batched_background_migration` before its status is checked again
        #
        # When no migration matches, a warning is logged and the logger's return value is returned, unless the
        # `DBLAB_ENVIRONMENT` environment variable is set, in which case NonExistentMigrationError is raised.
        #
        # Raises a RuntimeError when called inside a transaction, or when the migration is still not finished
        # at the end; the latter message includes the rake command for finalizing it manually.
        def ensure_batched_background_migration_is_finished(job_class_name:, table_name:, column_name:, job_arguments:, finalize: true)
          Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode!

          if transaction_open?
            raise 'The `ensure_batched_background_migration_is_finished` cannot be run inside a transaction. ' \
              'You can disable transactions by calling `disable_ddl_transaction!` in the body of ' \
              'your migration class.'
          end

          batched_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration
          batched_migrations.reset_column_information

          migration = batched_migrations.find_for_configuration(
            Gitlab::Database.gitlab_schemas_for_connection(connection),
            job_class_name, table_name, column_name, job_arguments
          )

          # Reused for the log line, the finalize call and the failure message below.
          configuration = {
            job_class_name: job_class_name,
            table_name: table_name,
            column_name: column_name,
            job_arguments: job_arguments
          }

          if migration.nil?
            if ENV['DBLAB_ENVIRONMENT']
              raise NonExistentMigrationError, 'called ensure_batched_background_migration_is_finished with non-existent migration name'
            end

            return Gitlab::AppLogger.warn "Could not find batched background migration for the given configuration: #{configuration}"
          end

          return if migration.finished?

          finalize_batched_background_migration(**configuration) if finalize

          # Re-read the record so the status reflects whatever finalization changed.
          return if migration.reload.finished? # rubocop:disable Cop/ActiveRecordAssociationReload

          # Commas are backslash-escaped, presumably so the JSON array survives rake's comma-separated
          # task arguments.
          escaped_job_arguments = job_arguments.to_json.gsub(',', '\,')

          raise "Expected batched background migration for the given configuration to be marked as 'finished', " \
            "but it is '#{migration.status_name}':" \
            "\t#{configuration}" \
            "\n\n" \
            "Finalize it manually by running the following command in a `bash` or `sh` shell:" \
            "\n\n" \
            "\tsudo gitlab-rake gitlab:background_migrations:finalize[#{job_class_name},#{table_name},#{column_name},'#{escaped_job_arguments}']" \
            "\n\n" \
            "For more information, check the documentation" \
            "\n\n" \
            "\thttps://docs.gitlab.com/ee/update/background_migrations.html#database-migrations-failing-because-of-batched-background-migration-not-finished"
        end

        private

        # Assigns the `BatchedMigration` attributes that were added after the initial
        # `batched_background_migrations` table was created. The model is not isolated in migrations, so a
        # migration that ran against the initial schema does not know about the later columns; each attribute
        # is therefore only assigned when the record actually responds to it.
        #
        # migration - The BatchedMigration record being built
        # max_batch_size - Value for the `max_batch_size` attribute
        # batch_table_name - Table whose estimated row count is stored in `total_tuple_count`
        # gitlab_schema - Value for the `gitlab_schema` attribute
        # queued_migration_version - Value for the `queued_migration_version` attribute
        def assign_attribtues_safely(migration, max_batch_size, batch_table_name, gitlab_schema, queued_migration_version)
          # The estimated number of tuples is recorded so the overall progress of the migration can be
          # reasoned about later. It is nil when no PgClass entry is found for the table.
          estimated_tuple_count = Gitlab::Database::SharedModel.using_connection(connection) do
            Gitlab::Database::PgClass.for_table(batch_table_name)&.cardinality_estimate
          end

          # rubocop:disable GitlabSecurity/PublicSend
          {
            max_batch_size: max_batch_size,
            total_tuple_count: estimated_tuple_count,
            gitlab_schema: gitlab_schema,
            queued_migration_version: queued_migration_version
          }.each do |attribute, value|
            next unless migration.respond_to?(attribute)

            migration.public_send("#{attribute}=", value)
          end
          # rubocop:enable GitlabSecurity/PublicSend
        end
      end
    end
  end
end