gitlab.com/gitlab-org/gitlab-foss.git
author    GitLab Bot <gitlab-bot@gitlab.com>  2020-12-17 14:59:07 +0300
committer GitLab Bot <gitlab-bot@gitlab.com>  2020-12-17 14:59:07 +0300
commit    8b573c94895dc0ac0e1d9d59cf3e8745e8b539ca (patch)
tree      544930fb309b30317ae9797a9683768705d664c4 /lib/gitlab/database
parent    4b1de649d0168371549608993deac953eb692019 (diff)

Add latest changes from gitlab-org/gitlab@13-7-stable-ee (tag: v13.7.0-rc42)
Diffstat (limited to 'lib/gitlab/database')
 lib/gitlab/database/batch_count.rb                              |  15 +-
 lib/gitlab/database/migrations/background_migration_helpers.rb  |  11 +-
 lib/gitlab/database/postgres_hll/batch_distinct_counter.rb      | 159 +++++
 lib/gitlab/database/postgres_index.rb                           |  17 +-
 lib/gitlab/database/postgres_index_bloat_estimate.rb            |  18 +
 lib/gitlab/database/postgresql_adapter/empty_query_ping.rb      |  22 +
 lib/gitlab/database/reindexing.rb                               |  10 +-
 lib/gitlab/database/reindexing/concurrent_reindex.rb            |  15 +
 lib/gitlab/database/reindexing/index_selection.rb               |  36 +
 lib/gitlab/database/reindexing/reindex_action.rb                |   9 +-
 10 files changed, 297 insertions(+), 15 deletions(-)
diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb
index 6f79e965cd5..5a506da0d05 100644
--- a/lib/gitlab/database/batch_count.rb
+++ b/lib/gitlab/database/batch_count.rb
@@ -49,6 +49,8 @@ module Gitlab
MAX_ALLOWED_LOOPS = 10_000
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
ALLOWED_MODES = [:itself, :distinct].freeze
+ FALLBACK_FINISH = 0
+ OFFSET_BY_ONE = 1
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_DISTINCT_BATCH_SIZE = 10_000
@@ -65,7 +67,7 @@ module Gitlab
(@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
(@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
(finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
- start > finish
+ start >= finish
end
def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
@@ -85,11 +87,13 @@ module Gitlab
results = nil
batch_start = start
- while batch_start <= finish
- batch_relation = build_relation_batch(batch_start, batch_start + batch_size, mode)
+ while batch_start < finish
+ batch_end = [batch_start + batch_size, finish].min
+ batch_relation = build_relation_batch(batch_start, batch_end, mode)
+
begin
results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
- batch_start += batch_size
+ batch_start = batch_end
rescue ActiveRecord::QueryCanceled => error
# retry with a safe batch size & warmer cache
if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
@@ -99,6 +103,7 @@ module Gitlab
return FALLBACK
end
end
+
sleep(SLEEP_TIME_IN_SECONDS)
end
@@ -138,7 +143,7 @@ module Gitlab
end
def actual_finish(finish)
- finish || @relation.unscope(:group, :having).maximum(@column) || 0
+ (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE
end
def check_mode!(mode)
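
The change above converts batch counting from inclusive [start, finish] ranges to half-open [start, finish) ranges: actual_finish now adds OFFSET_BY_ONE past the maximum value, the loop condition becomes strict, and the last batch is clamped to finish. A minimal sketch of the resulting loop shape in plain Ruby (walk_batches is an illustrative name, not part of the diff):

    def walk_batches(start, finish, batch_size)
      batch_start = start
      while batch_start < finish                            # finish is exclusive
        batch_end = [batch_start + batch_size, finish].min  # clamp the final batch
        yield batch_start, batch_end                        # process [batch_start, batch_end)
        batch_start = batch_end                             # contiguous batches, no overlap
      end
    end

    # With a maximum id of 10, the caller passes finish = 10 + OFFSET_BY_ONE = 11,
    # so every id in 1..10 falls into exactly one batch:
    walk_batches(1, 11, 4) { |s, e| puts "batch covers ids #{s} to #{e - 1}" }
    # batch covers ids 1 to 4
    # batch covers ids 5 to 8
    # batch covers ids 9 to 10
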
diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb
index a6cc03aa9eb..36073844765 100644
--- a/lib/gitlab/database/migrations/background_migration_helpers.rb
+++ b/lib/gitlab/database/migrations/background_migration_helpers.rb
@@ -55,7 +55,8 @@ module Gitlab
bulk_migrate_async(jobs) unless jobs.empty?
end
- # Queues background migration jobs for an entire table, batched by ID range.
+ # Queues background migration jobs for an entire table in batches.
+ # The default batching column used is the standard primary key `id`.
# Each job is scheduled with a `delay_interval` in between.
# If you use a small interval, then some jobs may run at the same time.
#
@@ -68,6 +69,7 @@ module Gitlab
# is scheduled to be run. These records can be used to trace execution of the background job, but there is no
# builtin support to manage that automatically at this time. You should only set this flag if you are aware of
# how it works, and intend to manually cleanup the database records in your background job.
+ # primary_column_name - The name of the primary key column if the primary key is not `id`
#
# *Returns the final migration delay*
#
@@ -87,8 +89,9 @@ module Gitlab
# # do something
# end
# end
- def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false)
- raise "#{model_class} does not have an ID to use for batch ranges" unless model_class.column_names.include?('id')
+ def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false, primary_column_name: :id)
+ raise "#{model_class} does not have an ID column of #{primary_column_name} to use for batch ranges" unless model_class.column_names.include?(primary_column_name.to_s)
+ raise "#{primary_column_name} is not an integer column" unless model_class.columns_hash[primary_column_name.to_s].type == :integer
# To not overload the worker too much we enforce a minimum interval both
# when scheduling and performing jobs.
@@ -99,7 +102,7 @@ module Gitlab
final_delay = 0
model_class.each_batch(of: batch_size) do |relation, index|
- start_id, end_id = relation.pluck(Arel.sql('MIN(id), MAX(id)')).first
+ start_id, end_id = relation.pluck(Arel.sql("MIN(#{primary_column_name}), MAX(#{primary_column_name})")).first
# `BackgroundMigrationWorker.bulk_perform_in` schedules all jobs for
# the same time, which is not helpful in most cases where we wish to
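
With the new primary_column_name: option, the helper can batch over tables whose integer primary key is not id. A hedged usage sketch; the migration class, Event model, and :event_id column are illustrative names, not part of this diff:

    class ScheduleExampleBackfill < ActiveRecord::Migration[6.0]
      include Gitlab::Database::Migrations::BackgroundMigrationHelpers

      # Isolated model so the migration does not depend on application code.
      class Event < ActiveRecord::Base
        include EachBatch
        self.table_name = 'events'
      end

      def up
        queue_background_migration_jobs_by_range_at_intervals(
          Event,
          'ExampleBackfill',              # background migration job class name
          2.minutes,                      # delay interval between scheduled jobs
          batch_size: 10_000,
          primary_column_name: :event_id, # must be an existing integer column
          track_jobs: true
        )
      end
    end
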
diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
new file mode 100644
index 00000000000..33faa2ef1b0
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module PostgresHll
+ # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
+ # Implements a distinct batch counter based on the HyperLogLog algorithm.
+ # Needs indexes on the batched column to calculate max, min and range queries.
+ # For larger tables, set a higher batch_size alongside index optimization.
+ #
+ # To avoid a potentially complex, time-consuming query when calculating min and max values,
+ # start and finish can be supplied explicitly. They should contain the min and max values of the
+ # relation's PRIMARY KEY (in most cases the `id` column) rather than of the counted attribute, eg:
+ # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
+ #
+ # Grouped relations are NOT supported yet.
+ #
+ # @example Usage
+ #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count
+ #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period))
+ #    .estimate_distinct_count(
+ #      batch_size: 1_000,
+ #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
+ #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
+ #    )
+ #
+ # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of the given
+ #  attribute for the supplied relation. Like all probabilistic algorithms it has an ERROR RATE
+ #  margin that can affect the result; for this implementation no error higher than 5.3% was reported
+ #  (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation), and in most cases it is lower. However, if an exact value is necessary, other tools have to be used.
+ class BatchDistinctCounter
+ ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
+ FALLBACK = -1
+ MIN_REQUIRED_BATCH_SIZE = 750
+ SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
+ MAX_DATA_VOLUME = 4_000_000_000
+
+ # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
+ DEFAULT_BATCH_SIZE = 10_000
+
+ BIT_31_MASK = "B'0#{'1' * 31}'"
+ BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+ # @example source_query
+ # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+ # FROM %{relation}
+ # WHERE %{pkey} >= %{batch_start}
+ # AND %{pkey} < %{batch_end}
+ # AND %{column} IS NOT NULL
+ BUCKETED_DATA_SQL = <<~SQL
+ WITH hashed_attributes AS (%{source_query})
+ SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+ (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
+ FROM hashed_attributes
+ GROUP BY 1
+ SQL
+
+ TOTAL_BUCKETS_NUMBER = 512
+
+ def initialize(relation, column = nil)
+ @relation = relation
+ @column = column || relation.primary_key
+ end
+
+ def unwanted_configuration?(finish, batch_size, start)
+ batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+ (finish - start) >= MAX_DATA_VOLUME ||
+ start > finish
+ end
+
+ def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+ raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
+
+ batch_size ||= DEFAULT_BATCH_SIZE
+
+ start = actual_start(start)
+ finish = actual_finish(finish)
+
+ raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
+ return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+
+ batch_start = start
+ hll_blob = {}
+
+ while batch_start <= finish
+ begin
+ hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+ batch_start += batch_size
+ end
+ sleep(SLEEP_TIME_IN_SECONDS)
+ end
+
+ estimate_cardinality(hll_blob)
+ end
+
+ private
+
+ # The arbitrary constants in #estimate_cardinality are sourced from the
+ # https://www.sisense.com/blog/hyperloglog-in-pure-sql/ article; they
+ # do not represent any entity and serve as tuning values
+ # for the whole equation.
+ def estimate_cardinality(hll_blob)
+ num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
+
+ num_uniques = (
+ ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
+ (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
+ ).to_i
+
+ if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
+ ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
+ Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
+ else
+ num_uniques
+ end
+ end
+
+ def hll_blob_for_batch(start, finish)
+ @relation
+ .connection
+ .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
+ .map(&:values)
+ .to_h
+ end
+
+ # Generate the source query SQL snippet for the provided id range
+ #
+ # @example SQL query template
+ # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+ # FROM %{relation}
+ # WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
+ # AND %{column} IS NOT NULL
+ #
+ # @param start initial id range
+ # @param finish final id range
+ # @return [String] SQL query fragment
+ def source_query(start, finish)
+ col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
+ col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
+ md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
+ md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
+ bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
+
+ @relation
+ .where(@relation.primary_key => (start...finish))
+ .where(col_as_arel.not_eq(nil))
+ .select(bits.as('attr_hash_32_bits')).to_sql
+ end
+
+ def actual_start(start)
+ start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
+ end
+
+ def actual_finish(finish)
+ finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
+ end
+ end
+ end
+ end
+end
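
For intuition: the estimator buckets each value by 9 bits of its 32-bit md5 hash (TOTAL_BUCKETS_NUMBER = 512) and keeps, per bucket, the highest position of the lowest set bit seen in the remaining 31 bits; estimate_cardinality then applies the bias-corrected harmonic mean with a linear-counting correction for small results. A standalone restatement of that arithmetic in plain Ruby (estimate is an illustrative name):

    TOTAL_BUCKETS_NUMBER = 512

    # hll_blob maps bucket_num => bucket_hash, as accumulated batch by batch above.
    def estimate(hll_blob)
      num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
      alpha = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)  # bias-correction constant

      num_uniques = (
        (alpha * TOTAL_BUCKETS_NUMBER**2) /
        (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**-bucket_hash })
      ).to_i

      # Linear-counting correction for small cardinalities:
      if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
        alpha * TOTAL_BUCKETS_NUMBER * Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)
      else
        num_uniques
      end
    end
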
diff --git a/lib/gitlab/database/postgres_index.rb b/lib/gitlab/database/postgres_index.rb
index 2a9f23f0098..6e734834841 100644
--- a/lib/gitlab/database/postgres_index.rb
+++ b/lib/gitlab/database/postgres_index.rb
@@ -3,9 +3,14 @@
module Gitlab
module Database
class PostgresIndex < ActiveRecord::Base
+ include Gitlab::Utils::StrongMemoize
+
self.table_name = 'postgres_indexes'
self.primary_key = 'identifier'
+ has_one :bloat_estimate, class_name: 'Gitlab::Database::PostgresIndexBloatEstimate', foreign_key: :identifier
+ has_many :reindexing_actions, class_name: 'Gitlab::Database::Reindexing::ReindexAction', foreign_key: :index_identifier
+
scope :by_identifier, ->(identifier) do
raise ArgumentError, "Index name is not fully qualified with a schema: #{identifier}" unless identifier =~ /^\w+\.\w+$/
@@ -17,11 +22,17 @@ module Gitlab
# is defined on a table that is not partitioned.
scope :regular, -> { where(unique: false, partitioned: false, exclusion: false)}
- scope :random_few, ->(how_many) do
- limit(how_many).order(Arel.sql('RANDOM()'))
+ scope :not_match, ->(regex) { where("name !~ ?", regex)}
+
+ scope :not_recently_reindexed, -> do
+ recent_actions = Reindexing::ReindexAction.recent.where('index_identifier = identifier')
+
+ where('NOT EXISTS (?)', recent_actions)
end
- scope :not_match, ->(regex) { where("name !~ ?", regex)}
+ def bloat_size
+ strong_memoize(:bloat_size) { bloat_estimate&.bloat_size || 0 }
+ end
def to_s
name
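
The new association and scopes compose with the existing ones when selecting reindexing candidates; a brief sketch (the regex is illustrative):

    candidates = Gitlab::Database::PostgresIndex
      .regular                      # not unique, not partitioned, no exclusion constraint
      .not_match('^tmp_reindex_')   # e.g. skip leftover temporary indexes
      .not_recently_reindexed       # NOT EXISTS anti-join on recent ReindexActions

    candidates.each do |index|
      puts "#{index}: #{index.bloat_size} bytes of estimated bloat"
    end
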
diff --git a/lib/gitlab/database/postgres_index_bloat_estimate.rb b/lib/gitlab/database/postgres_index_bloat_estimate.rb
new file mode 100644
index 00000000000..379227bf87c
--- /dev/null
+++ b/lib/gitlab/database/postgres_index_bloat_estimate.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ # Use this model with care: Retrieving bloat statistics
+ # for all indexes can be expensive in a large database.
+ #
+ # Best used on a per-index basis.
+ class PostgresIndexBloatEstimate < ActiveRecord::Base
+ self.table_name = 'postgres_index_bloat_estimates'
+ self.primary_key = 'identifier'
+
+ belongs_to :index, foreign_key: :identifier, class_name: 'Gitlab::Database::PostgresIndex'
+
+ alias_attribute :bloat_size, :bloat_size_bytes
+ end
+ end
+end
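
As the comment warns, this model is best queried per index; a minimal sketch (the identifier is illustrative):

    estimate = Gitlab::Database::PostgresIndexBloatEstimate.find('public.index_users_on_username')
    estimate.bloat_size # => bloat_size_bytes, via the alias_attribute above
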
diff --git a/lib/gitlab/database/postgresql_adapter/empty_query_ping.rb b/lib/gitlab/database/postgresql_adapter/empty_query_ping.rb
new file mode 100644
index 00000000000..906312478ac
--- /dev/null
+++ b/lib/gitlab/database/postgresql_adapter/empty_query_ping.rb
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+# rubocop:disable Gitlab/ModuleWithInstanceVariables
+module Gitlab
+ module Database
+ module PostgresqlAdapter
+ module EmptyQueryPing
+ # ActiveRecord uses `SELECT 1` to check if the connection is alive
+ # We patch this here to use an empty query instead, which is a bit faster
+ def active?
+ @lock.synchronize do
+ @connection.query ';'
+ end
+ true
+ rescue PG::Error
+ false
+ end
+ end
+ end
+ end
+end
+# rubocop:enable Gitlab/ModuleWithInstanceVariables
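
This diff does not show where the module is installed; presumably it is prepended to the PostgreSQL adapter in an initializer so that active? resolves to the patched version first. A sketch under that assumption:

    # Assumed wiring (not part of this diff), e.g. in a Rails initializer:
    require 'active_record/connection_adapters/postgresql_adapter'

    ActiveRecord::ConnectionAdapters::PostgreSQLAdapter.prepend(
      Gitlab::Database::PostgresqlAdapter::EmptyQueryPing
    )
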
diff --git a/lib/gitlab/database/reindexing.rb b/lib/gitlab/database/reindexing.rb
index c77e000254f..832f7438cf9 100644
--- a/lib/gitlab/database/reindexing.rb
+++ b/lib/gitlab/database/reindexing.rb
@@ -3,8 +3,14 @@
module Gitlab
module Database
module Reindexing
- def self.perform(index_selector)
- Coordinator.new(index_selector).perform
+ # Number of indexes to reindex per invocation
+ DEFAULT_INDEXES_PER_INVOCATION = 2
+
+ # candidate_indexes: Array of Gitlab::Database::PostgresIndex
+ def self.perform(candidate_indexes, how_many: DEFAULT_INDEXES_PER_INVOCATION)
+ indexes = IndexSelection.new(candidate_indexes).take(how_many)
+
+ Coordinator.new(indexes).perform
end
def self.candidate_indexes
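
Combined with IndexSelection below, a caller (e.g. a periodic worker) now passes a candidate relation and perform picks the most bloated indexes that were not recently reindexed; a hedged sketch:

    # Assumes candidate_indexes returns a Gitlab::Database::PostgresIndex relation.
    Gitlab::Database::Reindexing.perform(
      Gitlab::Database::Reindexing.candidate_indexes,
      how_many: 2 # DEFAULT_INDEXES_PER_INVOCATION
    )
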
diff --git a/lib/gitlab/database/reindexing/concurrent_reindex.rb b/lib/gitlab/database/reindexing/concurrent_reindex.rb
index fd3dca88567..a6fe7d61a4f 100644
--- a/lib/gitlab/database/reindexing/concurrent_reindex.rb
+++ b/lib/gitlab/database/reindexing/concurrent_reindex.rb
@@ -59,6 +59,13 @@ module Gitlab
raise ReindexError, "failed to reindex #{index}: #{message}"
end
+ # Some expression indexes (aka functional indexes)
+ # require additional statistics. The existing statistics
+ # are tightly bound to the original index. We have to
+ # rebuild statistics for the new index before dropping
+ # the original one.
+ rebuild_statistics if index.expression?
+
yield replacement_index
ensure
begin
@@ -96,6 +103,14 @@ module Gitlab
end
end
+ def rebuild_statistics
+ logger.info("rebuilding table statistics for #{index.schema}.#{index.tablename}")
+
+ connection.execute(<<~SQL)
+ ANALYZE #{quote_table_name(index.schema)}.#{quote_table_name(index.tablename)}
+ SQL
+ end
+
def replacement_index_name
@replacement_index_name ||= "#{TEMPORARY_INDEX_PREFIX}#{index.indexrelid}"
end
diff --git a/lib/gitlab/database/reindexing/index_selection.rb b/lib/gitlab/database/reindexing/index_selection.rb
new file mode 100644
index 00000000000..406e70791df
--- /dev/null
+++ b/lib/gitlab/database/reindexing/index_selection.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Reindexing
+ class IndexSelection
+ include Enumerable
+
+ delegate :each, to: :indexes
+
+ def initialize(candidates)
+ @candidates = candidates
+ end
+
+ private
+
+ attr_reader :candidates
+
+ def indexes
+ # This is an explicit N+1 query:
+ # Bloat estimates are generally available through a view
+ # for all indexes. However, estimating bloat for all
+ # indexes at once is an expensive operation. Therefore,
+ # we force an N+1 pattern here and estimate bloat on a
+ # per-index basis.
+
+ @indexes ||= filter_candidates.sort_by(&:bloat_size).reverse
+ end
+
+ def filter_candidates
+ candidates.not_recently_reindexed
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/database/reindexing/reindex_action.rb b/lib/gitlab/database/reindexing/reindex_action.rb
index 0928ef90e5d..8c59cffe5fb 100644
--- a/lib/gitlab/database/reindexing/reindex_action.rb
+++ b/lib/gitlab/database/reindexing/reindex_action.rb
@@ -6,13 +6,20 @@ module Gitlab
class ReindexAction < ActiveRecord::Base
self.table_name = 'postgres_reindex_actions'
+ belongs_to :index, foreign_key: :index_identifier, class_name: 'Gitlab::Database::PostgresIndex'
enum state: { started: 0, finished: 1, failed: 2 }
+ # Amount of time to consider a previous reindexing *recent*
+ RECENT_THRESHOLD = 7.days
+
+ scope :recent, -> { where(state: :finished).where('action_end > ?', Time.zone.now - RECENT_THRESHOLD) }
+
def self.keep_track_of(index, &block)
action = create!(
index_identifier: index.identifier,
action_start: Time.zone.now,
- ondisk_size_bytes_start: index.ondisk_size_bytes
+ ondisk_size_bytes_start: index.ondisk_size_bytes,
+ bloat_estimate_bytes_start: index.bloat_size
)
yield
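
keep_track_of now also snapshots the bloat estimate when an action starts. A usage sketch, as the reindexing coordinator might call it (the identifier is illustrative):

    index = Gitlab::Database::PostgresIndex.by_identifier('public.index_users_on_username')

    Gitlab::Database::Reindexing::ReindexAction.keep_track_of(index) do
      # ... perform the actual reindexing work ...
    end
    # On entry the action records action_start, ondisk_size_bytes_start and
    # bloat_estimate_bytes_start (index.bloat_size, which defaults to 0).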