gitlab.com/gitlab-org/gitlab-foss.git
author    Robert Speicher <rspeicher@gmail.com>  2021-01-20 22:34:23 +0300
committer Robert Speicher <rspeicher@gmail.com>  2021-01-20 22:34:23 +0300
commit    6438df3a1e0fb944485cebf07976160184697d72 (patch)
tree      00b09bfd170e77ae9391b1a2f5a93ef6839f2597 /lib/gitlab/database
parent    42bcd54d971da7ef2854b896a7b34f4ef8601067 (diff)

Add latest changes from gitlab-org/gitlab@13-8-stable-ee (tag: v13.8.0-rc42)

Diffstat (limited to 'lib/gitlab/database')
 lib/gitlab/database/median.rb                                       | 149
 lib/gitlab/database/migration_helpers.rb                            | 175
 lib/gitlab/database/migrations/background_migration_helpers.rb      |  10
 lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb |   2
 lib/gitlab/database/postgres_hll/batch_distinct_counter.rb          |  62
 lib/gitlab/database/postgres_hll/buckets.rb                         |  77
 lib/gitlab/database/reindexing.rb                                   |   6
 lib/gitlab/database/reindexing/coordinator.rb                       |  38
 lib/gitlab/database/reindexing/grafana_notifier.rb                  |  72
 lib/gitlab/database/reindexing/reindex_action.rb                    |  20
 10 files changed, 395 insertions(+), 216 deletions(-)
diff --git a/lib/gitlab/database/median.rb b/lib/gitlab/database/median.rb
deleted file mode 100644
index 603b125d8b4..00000000000
--- a/lib/gitlab/database/median.rb
+++ /dev/null
@@ -1,149 +0,0 @@
-# frozen_string_literal: true
-
-# https://www.periscopedata.com/blog/medians-in-sql.html
-module Gitlab
- module Database
- module Median
- NotSupportedError = Class.new(StandardError)
-
- def median_datetime(arel_table, query_so_far, column_sym)
- extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence
- end
-
- def median_datetimes(arel_table, query_so_far, column_sym, partition_column)
- extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence
- end
-
- def extract_median(results)
- result = results.compact.first
-
- result = result.first.presence
-
- result['median']&.to_f if result
- end
-
- def extract_medians(results)
- median_values = results.compact.first.values
-
- median_values.each_with_object({}) do |(id, median), hash|
- hash[id.to_i] = median&.to_f
- end
- end
-
- def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil)
- # Create a CTE with the column we're operating on, row number (after sorting by the column
- # we're operating on), and count of the table we're operating on (duplicated across) all rows
- # of the CTE. For example, if we're looking to find the median of the `projects.star_count`
- # column, the CTE might look like this:
- #
- # star_count | row_id | ct
- # ------------+--------+----
- # 5 | 1 | 3
- # 9 | 2 | 3
- # 15 | 3 | 3
- #
- # If a partition column is used we will do the same operation but for separate partitions,
- # when that happens the CTE might look like this:
- #
- # project_id | star_count | row_id | ct
- # ------------+------------+--------+----
- # 1 | 5 | 1 | 2
- # 1 | 9 | 2 | 2
- # 2 | 10 | 1 | 3
- # 2 | 15 | 2 | 3
- # 2 | 20 | 3 | 3
- cte_table = Arel::Table.new("ordered_records")
-
- cte = Arel::Nodes::As.new(
- cte_table,
- arel_table.project(*rank_rows(arel_table, column_sym, partition_column)).
- # Disallow negative values
- where(arel_table[column_sym].gteq(zero_interval)))
-
- # From the CTE, select either the middle row or the middle two rows (this is accomplished
- # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
- # selected rows, and this is the median value.
- result =
- cte_table
- .project(*median_projections(cte_table, column_sym, partition_column))
- .where(
- Arel::Nodes::Between.new(
- cte_table[:row_id],
- Arel::Nodes::And.new(
- [(cte_table[:ct] / Arel.sql('2.0')),
- (cte_table[:ct] / Arel.sql('2.0') + 1)]
- )
- )
- )
- .with(query_so_far, cte)
-
- result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column
-
- result.to_sql
- end
-
- private
-
- def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil)
- queries = pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
-
- Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) }
- end
-
- def average(args, as)
- Arel::Nodes::NamedFunction.new("AVG", args, as)
- end
-
- def rank_rows(arel_table, column_sym, partition_column)
- column_row = arel_table[column_sym].as(column_sym.to_s)
-
- if partition_column
- partition_row = arel_table[partition_column]
- row_id =
- Arel::Nodes::Over.new(
- Arel::Nodes::NamedFunction.new('rank', []),
- Arel::Nodes::Window.new.partition(arel_table[partition_column])
- .order(arel_table[column_sym])
- ).as('row_id')
-
- count = arel_table.from.from(arel_table.alias)
- .project('COUNT(*)')
- .where(arel_table[partition_column].eq(arel_table.alias[partition_column]))
- .as('ct')
-
- [partition_row, column_row, row_id, count]
- else
- row_id =
- Arel::Nodes::Over.new(
- Arel::Nodes::NamedFunction.new('row_number', []),
- Arel::Nodes::Window.new.order(arel_table[column_sym])
- ).as('row_id')
-
- count = arel_table.where(arel_table[column_sym].gteq(zero_interval)).project("COUNT(1)").as('ct')
-
- [column_row, row_id, count]
- end
- end
-
- def median_projections(table, column_sym, partition_column)
- projections = []
- projections << table[partition_column] if partition_column
- projections << average([extract_epoch(table[column_sym])], "median")
- projections
- end
-
- def extract_epoch(arel_attribute)
- Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
- end
-
- def extract_diff_epoch(diff)
- Arel.sql(%Q{EXTRACT(EPOCH FROM (#{diff.to_sql}))})
- end
-
- # Need to cast '0' to an INTERVAL before we can check if the interval is positive
- def zero_interval
- Arel::Nodes::NamedFunction.new("CAST", [Arel.sql("'0' AS INTERVAL")])
- end
- end
- end
-end
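
The deleted module built median queries by hand with a CTE and window functions. For readers wondering what a simpler approach looks like, PostgreSQL ships a built-in ordered-set aggregate that yields the same value; a minimal sketch with illustrative table and column names (not part of this commit, and not necessarily what GitLab replaced this module with):

    # Median of projects.star_count via PostgreSQL's percentile_cont aggregate.
    # Requires an established ActiveRecord connection; table/column are examples only.
    median = ActiveRecord::Base.connection.select_value(<<~SQL)
      SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY star_count) AS median
      FROM projects
    SQL

    median&.to_f
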
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 164fce5a5a3..6b169a504f3 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -70,6 +70,61 @@ module Gitlab
end
end
+ #
+ # Creates a new table, optionally allowing the caller to add check constraints to the table.
+ # Aside from that addition, this method should behave identically to Rails' `create_table` method.
+ #
+ # Example:
+ #
+ # create_table_with_constraints :some_table do |t|
+ # t.integer :thing, null: false
+ # t.text :other_thing
+ #
+ # t.check_constraint :thing_is_not_null, 'thing IS NOT NULL'
+ # t.text_limit :other_thing, 255
+ # end
+ #
+ # See Rails' `create_table` for more info on the available arguments.
+ def create_table_with_constraints(table_name, **options, &block)
+ helper_context = self
+ check_constraints = []
+
+ with_lock_retries do
+ create_table(table_name, **options) do |t|
+ t.define_singleton_method(:check_constraint) do |name, definition|
+ helper_context.send(:validate_check_constraint_name!, name) # rubocop:disable GitlabSecurity/PublicSend
+
+ check_constraints << { name: name, definition: definition }
+ end
+
+ t.define_singleton_method(:text_limit) do |column_name, limit, name: nil|
+ # rubocop:disable GitlabSecurity/PublicSend
+ name = helper_context.send(:text_limit_name, table_name, column_name, name: name)
+ helper_context.send(:validate_check_constraint_name!, name)
+ # rubocop:enable GitlabSecurity/PublicSend
+
+ column_name = helper_context.quote_column_name(column_name)
+ definition = "char_length(#{column_name}) <= #{limit}"
+
+ check_constraints << { name: name, definition: definition }
+ end
+
+ t.instance_eval(&block) unless block.nil?
+ end
+
+ next if check_constraints.empty?
+
+ constraint_clauses = check_constraints.map do |constraint|
+ "ADD CONSTRAINT #{quote_table_name(constraint[:name])} CHECK (#{constraint[:definition]})"
+ end
+
+ execute(<<~SQL)
+ ALTER TABLE #{quote_table_name(table_name)}
+ #{constraint_clauses.join(",\n")}
+ SQL
+ end
+ end
+
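
For the usage shown in the comment above, the constraint clauses collected by the singleton methods end up in a single ALTER TABLE, roughly as sketched below. The text-limit constraint name is generated by `text_limit_name`, so the name shown here is only a placeholder:

    # Roughly the statement the helper executes for the documented example;
    # "check_2fb0c2b0a1" is a placeholder for the generated text-limit constraint name.
    ActiveRecord::Base.connection.execute(<<~SQL)
      ALTER TABLE "some_table"
      ADD CONSTRAINT "thing_is_not_null" CHECK (thing IS NOT NULL),
      ADD CONSTRAINT "check_2fb0c2b0a1" CHECK (char_length("other_thing") <= 255)
    SQL
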
# Creates a new index, concurrently
#
# Example:
@@ -858,6 +913,120 @@ module Gitlab
end
end
+ # Initializes the conversion of an integer column to bigint
+ #
+ # It can be used for converting both a Primary Key and any Foreign Keys
+ # that may reference it or any other integer column that we may want to
+ # upgrade (e.g. columns that store IDs, but are not set as FKs).
+ #
+ # - For primary keys and Foreign Keys (or other columns) defined as NOT NULL,
+ # the new bigint column is added with a hardcoded NOT NULL DEFAULT 0
+ # which allows us to skip a very costly verification step once we
+ # are ready to switch it.
+ # This is crucial for Primary Key conversions, because setting a column
+ # as the PK converts even check constraints to NOT NULL constraints
+ # and forces an inline re-verification of the whole table.
+ # - It backfills the new column with the values of the existing primary key
+ # by scheduling background jobs.
+ # - It tracks the scheduled background jobs through the use of
+ # Gitlab::Database::BackgroundMigrationJob
+ # which allows a more thorough check that all jobs succeeded in the
+ # cleanup migration and is way faster for very large tables.
+ # - It sets up a trigger to keep the two columns in sync
+ # - It does not schedule a cleanup job: we have to do that with follow-up
+ # post-deployment migrations in the next release.
+ #
+ # This needs to be done manually by using the
+ # `cleanup_initialize_conversion_of_integer_to_bigint`
+ # (not yet implemented - check #288005)
+ #
+ # table - The name of the database table containing the column
+ # column - The name of the column that we want to convert to bigint.
+ # primary_key - The name of the primary key column (most often :id)
+ # batch_size - The number of rows to schedule in a single background migration
+ # sub_batch_size - The smaller batches that will be used by each scheduled job
+ # to update the table. Useful to keep each update at ~100ms while executing
+ # more updates per interval (2.minutes)
+ # Note that each execution of a sub-batch adds a constant 100ms sleep
+ # time in between the updates, which must be taken into account
+ # while calculating the batch, sub_batch and interval values.
+ # interval - The time interval between every background migration
+ #
+ # example:
+ # Assume that we have figured out that updating 200 records of the events
+ # table takes ~100ms on average.
+ # We can set the sub_batch_size to 200, leave the interval to the default
+ # and set the batch_size to 50_000 which will require
+ # ~50s = (50000 / 200) * (0.1 + 0.1) to complete and leaves breathing space
+ # between the scheduled jobs
+ def initialize_conversion_of_integer_to_bigint(
+ table,
+ column,
+ primary_key: :id,
+ batch_size: 20_000,
+ sub_batch_size: 1000,
+ interval: 2.minutes
+ )
+
+ if transaction_open?
+ raise 'initialize_conversion_of_integer_to_bigint can not be run inside a transaction'
+ end
+
+ unless table_exists?(table)
+ raise "Table #{table} does not exist"
+ end
+
+ unless column_exists?(table, primary_key)
+ raise "Column #{primary_key} does not exist on #{table}"
+ end
+
+ unless column_exists?(table, column)
+ raise "Column #{column} does not exist on #{table}"
+ end
+
+ check_trigger_permissions!(table)
+
+ old_column = column_for(table, column)
+ tmp_column = "#{column}_convert_to_bigint"
+
+ with_lock_retries do
+ if (column.to_s == primary_key.to_s) || !old_column.null
+ # If the column to be converted is either a PK or is defined as NOT NULL,
+ # set it to `NOT NULL DEFAULT 0` and copy the correct values over below.
+ # That way, we skip the expensive validation step required to add
+ # a NOT NULL constraint at the end of the process
+ add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false)
+ else
+ add_column(table, tmp_column, :bigint, default: old_column.default)
+ end
+
+ install_rename_triggers(table, column, tmp_column)
+ end
+
+ source_model = Class.new(ActiveRecord::Base) do
+ include EachBatch
+
+ self.table_name = table
+ self.inheritance_column = :_type_disabled
+ end
+
+ queue_background_migration_jobs_by_range_at_intervals(
+ source_model,
+ 'CopyColumnUsingBackgroundMigrationJob',
+ interval,
+ batch_size: batch_size,
+ other_job_arguments: [table, primary_key, column, tmp_column, sub_batch_size],
+ track_jobs: true,
+ primary_column_name: primary_key
+ )
+
+ if perform_background_migration_inline?
+ # To ensure the schema is up to date immediately we perform the
+ # migration inline in dev / test environments.
+ Gitlab::BackgroundMigration.steal('CopyColumnUsingBackgroundMigrationJob')
+ end
+ end
+
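
A sketch of how a post-deployment migration might call the new helper, following the worked example in the comment (the migration class, table, and tuning values are illustrative and not taken from this commit):

    # Hypothetical migration; the events table and the 50_000/200 batch sizes
    # mirror the example from the comment above.
    class InitializeConversionOfEventsIdToBigint < ActiveRecord::Migration[6.0]
      include Gitlab::Database::MigrationHelpers

      DOWNTIME = false

      disable_ddl_transaction! # the helper refuses to run inside a transaction

      def up
        initialize_conversion_of_integer_to_bigint :events, :id,
          batch_size: 50_000, sub_batch_size: 200
      end

      def down
        # Rollback would need to drop the sync trigger before removing the
        # id_convert_to_bigint column; the exact cleanup is omitted here
        # (see the note about #288005 above).
        raise ActiveRecord::IrreversibleMigration
      end
    end
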
# Performs a concurrent column rename when using PostgreSQL.
def install_rename_triggers_for_postgresql(trigger, table, old, new)
execute <<-EOF.strip_heredoc
@@ -996,9 +1165,9 @@ module Gitlab
Arel::Nodes::SqlLiteral.new(replace.to_sql)
end
- def remove_foreign_key_if_exists(*args)
- if foreign_key_exists?(*args)
- remove_foreign_key(*args)
+ def remove_foreign_key_if_exists(...)
+ if foreign_key_exists?(...)
+ remove_foreign_key(...)
end
end
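
The change above swaps `*args` splatting for Ruby's `...` argument forwarding (available since Ruby 2.7), which passes positional, keyword, and block arguments through unchanged. A standalone illustration (not GitLab code):

    def target(*args, **opts)
      [args, opts]
    end

    # `...` forwards every positional, keyword and block argument to the callee.
    def forwarder(...)
      target(...)
    end

    forwarder(:source_table, :target_table, column: :user_id)
    # => [[:source_table, :target_table], { column: :user_id }]
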
diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb
index 36073844765..12dcf68da2f 100644
--- a/lib/gitlab/database/migrations/background_migration_helpers.rb
+++ b/lib/gitlab/database/migrations/background_migration_helpers.rb
@@ -100,6 +100,7 @@ module Gitlab
end
final_delay = 0
+ batch_counter = 0
model_class.each_batch(of: batch_size) do |relation, index|
start_id, end_id = relation.pluck(Arel.sql("MIN(#{primary_column_name}), MAX(#{primary_column_name})")).first
@@ -112,8 +113,17 @@ module Gitlab
track_in_database(job_class_name, full_job_arguments) if track_jobs
migrate_in(final_delay, job_class_name, full_job_arguments)
+
+ batch_counter += 1
end
+ duration = initial_delay + delay_interval * batch_counter
+ say <<~SAY
+ Scheduled #{batch_counter} #{job_class_name} jobs with a maximum of #{batch_size} records per batch and an interval of #{delay_interval} seconds.
+
+ The migration is expected to take at least #{duration} seconds. Expect all jobs to have completed after #{Time.zone.now + duration}.
+ SAY
+
final_delay
end
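
The new message estimates total runtime as `initial_delay + delay_interval * batch_counter`; a worked example under assumed values (not from this commit):

    require 'active_support'
    require 'active_support/core_ext/integer/time'

    initial_delay  = 0          # no extra delay before the first job
    delay_interval = 2.minutes  # interval passed to queue_background_migration_jobs_by_range_at_intervals
    batch_counter  = 25         # batches produced by each_batch for the relation

    duration = initial_delay + delay_interval * batch_counter
    duration.to_i # => 3000, i.e. all jobs should have completed ~50 minutes after scheduling
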
diff --git a/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb b/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
index f367292f4b0..0bc1343acca 100644
--- a/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
+++ b/lib/gitlab/database/partitioning_migration_helpers/index_helpers.rb
@@ -32,7 +32,7 @@ module Gitlab
return
end
- partitioned_table.postgres_partitions.each do |partition|
+ partitioned_table.postgres_partitions.order(:name).each do |partition|
partition_index_name = generated_index_name(partition.identifier, options[:name])
partition_options = options.merge(name: partition_index_name)
diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
index 33faa2ef1b0..62dfaeeaae3 100644
--- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -16,9 +16,9 @@ module Gitlab
# Grouped relations are NOT supported yet.
#
# @example Usage
- # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
+ # ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).execute
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
- # .estimate_distinct_count(
+ # .execute(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
@@ -30,7 +30,6 @@ module Gitlab
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
- FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
@@ -38,8 +37,10 @@ module Gitlab
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
+ ZERO_OFFSET = 1
+ BUCKET_ID_MASK = (Buckets::TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
BIT_31_MASK = "B'0#{'1' * 31}'"
- BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+ BIT_32_NORMALIZED_BUCKET_ID_MASK = "B'#{'0' * (32 - BUCKET_ID_MASK.size)}#{BUCKET_ID_MASK}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
@@ -48,73 +49,58 @@ module Gitlab
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
- SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+ SELECT (attr_hash_32_bits & #{BIT_32_NORMALIZED_BUCKET_ID_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
- TOTAL_BUCKETS_NUMBER = 512
+ WRONG_CONFIGURATION_ERROR = Class.new(ActiveRecord::StatementInvalid)
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
- def unwanted_configuration?(finish, batch_size, start)
- batch_size <= MIN_REQUIRED_BATCH_SIZE ||
- (finish - start) >= MAX_DATA_VOLUME ||
- start > finish
- end
-
- def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+ # Executes the counter, iterating over the database source, and returns a Gitlab::Database::PostgresHll::Buckets
+ # instance that can be used to estimate the number of unique elements in the analysed set
+ #
+ # @param batch_size maximum number of rows analysed by a single database query
+ # @param start initial value of the primary key range
+ # @param finish final value of the primary key range
+ # @return [Gitlab::Database::PostgresHll::Buckets] HyperLogLog data structure instance that can estimate the number of unique elements
+ def execute(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
-
start = actual_start(start)
finish = actual_finish(finish)
- raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
- return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+ raise WRONG_CONFIGURATION_ERROR if unwanted_configuration?(start, finish, batch_size)
batch_start = start
- hll_blob = {}
+ hll_buckets = Buckets.new
while batch_start <= finish
begin
- hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+ hll_buckets.merge_hash!(hll_buckets_for_batch(batch_start, batch_start + batch_size))
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
- estimate_cardinality(hll_blob)
+ hll_buckets
end
private
- # arbitrary values that are present in #estimate_cardinality
- # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
- # article, they are not representing any entity and serves as tune value
- # for the whole equation
- def estimate_cardinality(hll_blob)
- num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-
- num_uniques = (
- ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
- (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
- ).to_i
-
- if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
- ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
- Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
- else
- num_uniques
- end
+ def unwanted_configuration?(start, finish, batch_size)
+ batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+ (finish - start) >= MAX_DATA_VOLUME ||
+ start > finish || start < 0 || finish < 0
end
- def hll_blob_for_batch(start, finish)
+ def hll_buckets_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
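
A quick check of the new mask constants, assuming TOTAL_BUCKETS = 512 as defined in the new Buckets class below: the derived 32-bit mask is bit-for-bit identical to the old hardcoded BIT_9_MASK.

    TOTAL_BUCKETS = 512
    ZERO_OFFSET   = 1

    bucket_id_mask = (TOTAL_BUCKETS - ZERO_OFFSET).to_s(2)
    # => "111111111" (nine 1-bits, enough to address bucket ids 0..511)

    bit_32_mask = "B'#{'0' * (32 - bucket_id_mask.size)}#{bucket_id_mask}'"
    # => "B'00000000000000000000000111111111'", the same as the old BIT_9_MASK
    #    ("B'" + 23 zeros + 9 ones + "'")
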
diff --git a/lib/gitlab/database/postgres_hll/buckets.rb b/lib/gitlab/database/postgres_hll/buckets.rb
new file mode 100644
index 00000000000..429e823379f
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll/buckets.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module PostgresHll
+ # The Buckets class represents a data structure built with the HyperLogLog algorithm
+ # that models the data distribution in the analysed set. This representation can then be used
+ # for the following purposes:
+ # 1. Estimating the number of unique elements that this structure represents
+ # 2. Merging with another Buckets structure to later estimate the number of unique elements
+ # in the union of the two represented data sets
+ # 3. Serializing the Buckets structure to JSON, so it can be stored in various persistence layers
+ #
+ # @example Usage
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
+ # ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
+
+ # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation
+ # Like all probabilistic algorithms it has an ERROR RATE margin that can affect the values.
+ # For this implementation no error higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
+ # and in most cases the error is lower. However, if the exact value is necessary other tools have to be used.
+ class Buckets
+ TOTAL_BUCKETS = 512
+
+ def initialize(buckets = {})
+ @buckets = buckets
+ end
+
+ # Estimates the number of unique elements in the analysed set, based on the HyperLogLog structure.
+ #
+ # @return [Float] Estimated number of unique elements
+ def estimated_distinct_count
+ @estimated_distinct_count ||= estimate_cardinality
+ end
+
+ # Updates the instance's underlying HyperLogLog structure by merging it with another HyperLogLog structure
+ #
+ # @param other_buckets_hash hash representation of another HyperLogLog structure
+ def merge_hash!(other_buckets_hash)
+ buckets.merge!(other_buckets_hash) {|_key, old, new| new > old ? new : old }
+ end
+
+ # Serializes the instance's underlying HyperLogLog structure to JSON, so it can be stored in various persistence layers
+ #
+ # @return [String] HyperLogLog data structure serialized to JSON
+ def to_json(_ = nil)
+ buckets.to_json
+ end
+
+ private
+
+ attr_accessor :buckets
+
+ # The arbitrary values present in #estimate_cardinality are sourced from the
+ # https://www.sisense.com/blog/hyperloglog-in-pure-sql/ article;
+ # they do not represent any entity and serve as tuning values
+ # for the whole equation
+ def estimate_cardinality
+ num_zero_buckets = TOTAL_BUCKETS - buckets.size
+
+ num_uniques = (
+ ((TOTAL_BUCKETS**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS))) /
+ (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
+ ).to_i
+
+ if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
+ ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
+ Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
+ else
+ num_uniques
+ end
+ end
+ end
+ end
+ end
+end
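
Taken together with the counter changes above, the intended flow is now roughly as follows (model and column names are illustrative):

    # Estimate the number of distinct creator_id values across projects.
    counter = Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project, :creator_id)

    buckets = counter.execute(batch_size: 10_000)   # => Gitlab::Database::PostgresHll::Buckets

    buckets.estimated_distinct_count # => Float, approximate count of unique creator_id values
    buckets.to_json                  # => serialized buckets, suitable for persisting and merging later
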
diff --git a/lib/gitlab/database/reindexing.rb b/lib/gitlab/database/reindexing.rb
index 832f7438cf9..0cfad690283 100644
--- a/lib/gitlab/database/reindexing.rb
+++ b/lib/gitlab/database/reindexing.rb
@@ -8,9 +8,9 @@ module Gitlab
# candidate_indexes: Array of Gitlab::Database::PostgresIndex
def self.perform(candidate_indexes, how_many: DEFAULT_INDEXES_PER_INVOCATION)
- indexes = IndexSelection.new(candidate_indexes).take(how_many)
-
- Coordinator.new(indexes).perform
+ IndexSelection.new(candidate_indexes).take(how_many).each do |index|
+ Coordinator.new(index).perform
+ end
end
def self.candidate_indexes
diff --git a/lib/gitlab/database/reindexing/coordinator.rb b/lib/gitlab/database/reindexing/coordinator.rb
index 0957f43e166..7a7d17ca196 100644
--- a/lib/gitlab/database/reindexing/coordinator.rb
+++ b/lib/gitlab/database/reindexing/coordinator.rb
@@ -12,26 +12,44 @@ module Gitlab
# statement timeouts).
TIMEOUT_PER_ACTION = 1.day
- attr_reader :indexes
+ attr_reader :index, :notifier
- def initialize(indexes)
- @indexes = indexes
+ def initialize(index, notifier = GrafanaNotifier.new)
+ @index = index
+ @notifier = notifier
end
def perform
- indexes.each do |index|
- # This obtains a global lease such that there's
- # only one live reindexing process at a time.
- try_obtain_lease do
- ReindexAction.keep_track_of(index) do
- ConcurrentReindex.new(index).perform
- end
+ # This obtains a global lease such that there's
+ # only one live reindexing process at a time.
+ try_obtain_lease do
+ action = ReindexAction.create_for(index)
+
+ with_notifications(action) do
+ perform_for(index, action)
end
end
end
private
+ def with_notifications(action)
+ notifier.notify_start(action)
+ yield
+ ensure
+ notifier.notify_end(action)
+ end
+
+ def perform_for(index, action)
+ ConcurrentReindex.new(index).perform
+ rescue
+ action.state = :failed
+
+ raise
+ ensure
+ action.finish
+ end
+
def lease_timeout
TIMEOUT_PER_ACTION
end
diff --git a/lib/gitlab/database/reindexing/grafana_notifier.rb b/lib/gitlab/database/reindexing/grafana_notifier.rb
new file mode 100644
index 00000000000..b1e5ecb9ade
--- /dev/null
+++ b/lib/gitlab/database/reindexing/grafana_notifier.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Reindexing
+ # This can be used to send annotations for reindexing to a Grafana API
+ class GrafanaNotifier
+ def initialize(api_key = ENV['GITLAB_GRAFANA_API_KEY'], api_url = ENV['GITLAB_GRAFANA_API_URL'], additional_tag = ENV['GITLAB_REINDEXING_GRAFANA_TAG'] || Rails.env)
+ @api_key = api_key
+ @api_url = api_url
+ @additional_tag = additional_tag
+ end
+
+ def notify_start(action)
+ return unless enabled?
+
+ payload = base_payload(action).merge(
+ text: "Started reindexing of #{action.index.name} on #{action.index.tablename}"
+ )
+
+ annotate(payload)
+ end
+
+ def notify_end(action)
+ return unless enabled?
+
+ payload = base_payload(action).merge(
+ text: "Finished reindexing of #{action.index.name} on #{action.index.tablename} (#{action.state})",
+ timeEnd: (action.action_end.utc.to_f * 1000).to_i,
+ isRegion: true
+ )
+
+ annotate(payload)
+ end
+
+ private
+
+ def base_payload(action)
+ {
+ time: (action.action_start.utc.to_f * 1000).to_i,
+ tags: ['reindex', @additional_tag, action.index.tablename, action.index.name].compact
+ }
+ end
+
+ def annotate(payload)
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": "Bearer #{@api_key}"
+ }
+
+ response = Gitlab::HTTP.post("#{@api_url}/api/annotations", body: payload.to_json, headers: headers, allow_local_requests: true)
+ success = response.success?
+
+ log_error("Response code #{response.code}") unless success
+
+ success
+ rescue => err
+ log_error(err)
+
+ false
+ end
+
+ def log_error(err)
+ Gitlab::AppLogger.warn("Unable to notify Grafana from #{self.class}: #{err}")
+ end
+
+ def enabled?
+ !(@api_url.blank? || @api_key.blank?)
+ end
+ end
+ end
+ end
+end
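
For reference, the annotation body that notify_end posts to <api_url>/api/annotations looks roughly like this (values are made up; timestamps are epoch milliseconds as computed in base_payload):

    # Illustrative notify_end payload; index/table names, tag and timestamps are invented.
    payload = {
      time:     1_611_167_663_000, # action_start in epoch milliseconds
      tags:     ['reindex', 'production', 'public.merge_requests', 'index_merge_requests_on_title'],
      text:     'Finished reindexing of index_merge_requests_on_title on public.merge_requests (finished)',
      timeEnd:  1_611_168_000_000, # action_end in epoch milliseconds
      isRegion: true
    }
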
diff --git a/lib/gitlab/database/reindexing/reindex_action.rb b/lib/gitlab/database/reindexing/reindex_action.rb
index 8c59cffe5fb..7e58201889f 100644
--- a/lib/gitlab/database/reindexing/reindex_action.rb
+++ b/lib/gitlab/database/reindexing/reindex_action.rb
@@ -14,27 +14,23 @@ module Gitlab
scope :recent, -> { where(state: :finished).where('action_end > ?', Time.zone.now - RECENT_THRESHOLD) }
- def self.keep_track_of(index, &block)
- action = create!(
+ def self.create_for(index)
+ create!(
index_identifier: index.identifier,
action_start: Time.zone.now,
ondisk_size_bytes_start: index.ondisk_size_bytes,
bloat_estimate_bytes_start: index.bloat_size
)
+ end
- yield
-
- action.state = :finished
- rescue
- action.state = :failed
- raise
- ensure
+ def finish
index.reload # rubocop:disable Cop/ActiveRecordAssociationReload
- action.action_end = Time.zone.now
- action.ondisk_size_bytes_end = index.ondisk_size_bytes
+ self.state = :finished unless failed?
+ self.action_end = Time.zone.now
+ self.ondisk_size_bytes_end = index.ondisk_size_bytes
- action.save!
+ save!
end
end
end
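
Put together with the Coordinator changes above, the tracking lifecycle now reads roughly as follows (a sketch of what Coordinator#perform_for does with the new API, given a Gitlab::Database::PostgresIndex record `index`):

    action = Gitlab::Database::Reindexing::ReindexAction.create_for(index) # records start size/bloat

    begin
      Gitlab::Database::Reindexing::ConcurrentReindex.new(index).perform
    rescue
      action.state = :failed # finish keeps the failed state
      raise
    ensure
      action.finish          # sets action_end, final on-disk size, and :finished unless failed
    end
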