Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andreas Brandl <abrandl@gitlab.com> 2018-10-28 21:16:47 +0300
committer Andreas Brandl <abrandl@gitlab.com> 2018-12-03 23:26:50 +0300
commit 3a7d9b4b02934c259fa2eda46143fa67320c2f5f (patch)
tree 011da8a1dedf0da904f3c962ff3c90e46e869123 /lib/gitlab/database
parent dce4a92bb06ae58e08bf104acc9b47a57f62c0dd (diff)
Implement TablesampleCountStrategy.
A tablesample count executes in two phases:
* Estimate table sizes based on reltuples.
* Based on the estimate:
  * If the table is considered 'small', execute an exact relation count.
  * Otherwise, count on a sample of the table using TABLESAMPLE.

The size of the sample is chosen in a way that we always roughly scan the same amount of rows (see TABLESAMPLE_ROW_TARGET).
Diffstat (limited to 'lib/gitlab/database')
-rw-r--r-- lib/gitlab/database/count.rb 128
1 files changed, 108 insertions, 20 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb
index e244bf40bbf..01fabcced65 100644
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
@@ -1,7 +1,12 @@
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
module Gitlab
module Database
module Count
@@ -20,12 +25,17 @@ module Gitlab
end
# Takes in an array of models and returns a Hash for the approximate
- # counts for them. If the model's table has not been vacuumed or
- # analyzed recently, simply run the Model.count to get the data.
+ # counts for them.
+ #
+ # Various count strategies can be specified that are executed in
+ # sequence until all tables have an approximate count attached
+ # or we run out of strategies.
+ #
+ # Note that not all strategies are available on all supported RDBMS.
#
# @param [Array]
# @return [Hash] of Model -> count mapping
- def self.approximate_counts(models, strategies: [ReltuplesCountStrategy, ExactCountStrategy])
+ def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
strategies.each_with_object({}) do |strategy, counts_by_model|
if strategy.enabled?
models_with_missing_counts = models - counts_by_model.keys
@@ -41,6 +51,13 @@ module Gitlab
end
end
+ # This strategy performs an exact count on the model.
+ #
+ # This is guaranteed to be accurate, however it also scans the
+ # whole table. Hence, there are no guarantees with respect
+ # to runtime.
+ #
+ # Note that for very large tables, this may even time out.
class ExactCountStrategy
attr_reader :models
def initialize(models)
@@ -58,6 +75,15 @@ module Gitlab
end
end
+ # This strategy counts based on PostgreSQL's statistics in pg_class.
+ #
+ # Specifically, it relies on the column reltuples in said table. An additional
+ # check against pg_stat_user_tables makes sure statistics were updated within the last hour.
+ #
+ # Otherwise, this strategy skips tables with outdated statistics.
+ #
+ # There are no guarantees with respect to the accuracy of this strategy. Runtime
+ # however is guaranteed to be "fast", because it only looks up statistics.
class ReltuplesCountStrategy
attr_reader :models
def initialize(models)
@@ -68,7 +94,23 @@ module Gitlab
#
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def count
- query = postgresql_estimate_query(table_names)
+ size_estimates
+ rescue *CONNECTION_ERRORS
+ {}
+ end
+
+ def self.enabled?
+ Gitlab::Database.postgresql?
+ end
+
+ private
+
+ def table_names
+ models.map(&:table_name)
+ end
+
+ def size_estimates(check_statistics: true)
+ query = postgresql_estimate_query(table_names, check_statistics: check_statistics)
rows = []
# Querying tuple stats only works on the primary. Due to load
@@ -83,18 +125,6 @@ module Gitlab
model = table_to_model[row['table_name']]
data[model] = row['estimate'].to_i
end
- rescue *CONNECTION_ERRORS => e
- {}
- end
-
- def self.enabled?
- Gitlab::Database.postgresql?
- end
-
- private
-
- def table_names
- models.map(&:table_name)
end
# Generates the PostgreSQL query to return the tuples for tables
@@ -102,14 +132,72 @@ module Gitlab
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
- def postgresql_estimate_query(table_names)
+ def postgresql_estimate_query(table_names, check_statistics: true)
time = "to_timestamp(#{1.hour.ago.to_i})"
- <<~SQL
+ base_query = <<~SQL
SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
- AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
SQL
+ if check_statistics
+ base_query + "AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})"
+ else
+ base_query
+ end
+ end
+ end
+
+ # A tablesample count executes in two phases:
+ # * Estimate table sizes based on reltuples.
+ # * Based on the estimate:
+ # * If the table is considered 'small', execute an exact relation count.
+ # * Otherwise, count on a sample of the table using TABLESAMPLE.
+ #
+ # The size of the sample is chosen in a way that we always roughly scan
+ # the same amount of rows (see TABLESAMPLE_ROW_TARGET).
+ #
+ # There are no guarantees with respect to the accuracy of the result or runtime.
+ class TablesampleCountStrategy < ReltuplesCountStrategy
+ EXACT_COUNT_THRESHOLD = 100_000
+ TABLESAMPLE_ROW_TARGET = 100_000
+
+ def count
+ estimates = size_estimates(check_statistics: false)
+
+ models.each_with_object({}) do |model, count_by_model|
+ count = perform_count(model, estimates[model])
+ count_by_model[model] = count if count
+ end
+ rescue *CONNECTION_ERRORS
+ {}
+ end
+
+ private
+ def perform_count(model, estimate)
+ # If we estimate 0, we may not have statistics at all. Don't use them.
+ return nil unless estimate && estimate > 0
+
+ if estimate < EXACT_COUNT_THRESHOLD
+ # The table is considered small, the assumption here is that
+ # the exact count will be fast anyways.
+ model.count
+ else
+ # The table is considered large, let's only count on a sample.
+ tablesample_count(model, estimate)
+ end
+ end
+
+ def tablesample_count(model, estimate)
+ portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
+ inverse = 1/portion
+ query = <<~SQL
+ SELECT (COUNT(*)*#{inverse})::integer AS count
+ FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion*100})
+ SQL
+
+ rows = ActiveRecord::Base.connection.select_all(query)
+
+ Integer(rows.first['count'])
end
end
end