1 files changed, 159 insertions, 0 deletions
diff --git a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
new file mode 100644
index 00000000000..33faa2ef1b0
--- /dev/null
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Database
+    module PostgresHll
+      # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
+      # Implements a distinct batch counter based on HyperLogLog algorithm
+      # Needs indexes on the column below to calculate max, min and range queries
+      # For larger tables just set higher batch_size with index optimization
+      #
+      # To avoid a potentially complex, time-consuming query when calculating the min and max values,
+      # start and finish can be passed explicitly. They should contain the min and max values of the PRIMARY KEY of the
+      # relation (in most cases the `id` column) rather than of the counted attribute, e.g.:
+      # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
+      #
+      # Grouped relations are NOT supported yet.
+      #
+      # @example Usage
+      #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count
+      #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period))
+      #    .estimate_distinct_count(
+      #      batch_size: 1_000,
+      #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
+      #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
+      #    )
+      #
+      # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of the given attribute for the supplied relation.
+      #  Like all probabilistic algorithms it has an ERROR RATE margin that can affect the returned values;
+      #  for this implementation no error higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
+      #  and in most cases the error is lower. However, if the exact value is necessary, other tools have to be used.
+      class BatchDistinctCounter
+        ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
+        FALLBACK = -1 # returned by #estimate_distinct_count when #unwanted_configuration? rejects the arguments
+        MIN_REQUIRED_BATCH_SIZE = 750 # batch sizes at or below this value are rejected
+        SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
+        MAX_DATA_VOLUME = 4_000_000_000 # primary key ranges spanning this many ids or more are rejected
+
+        # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
+        DEFAULT_BATCH_SIZE = 10_000
+
+        # 32-bit literal with the highest bit cleared and the remaining 31 bits set
+        BIT_31_MASK = "B'0#{'1' * 31}'"
+        # 32-bit literal with only the lowest 9 bits set; used to derive the bucket number
+        BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+
+        # Groups the hashed values of one batch by bucket_num (lowest 9 bits of the hash) and
+        # computes one bucket_hash per bucket from the smallest 31-bit masked hash in that bucket.
+        #
+        # @example source_query
+        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+        #   FROM %{relation}
+        #   WHERE %{pkey} >= %{batch_start}
+        #   AND %{pkey} < %{batch_end}
+        #   AND %{column} IS NOT NULL
+        BUCKETED_DATA_SQL = <<~SQL
+          WITH hashed_attributes AS (%{source_query})
+          SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+            (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
+          FROM hashed_attributes
+          GROUP BY 1
+        SQL
+
+        # 2**9, i.e. one bucket for every value the 9-bit bucket_num can take
+        TOTAL_BUCKETS_NUMBER = 512
+
+        # @param relation the relation whose rows are counted; it is asked for its primary_key and
+        #   connection and is filtered with `where` for every batch
+        # @param column [Symbol, String, Arel::Attributes::Attribute, nil] attribute whose distinct
+        #   values are estimated; defaults to the relation's primary key
+        def initialize(relation, column = nil)
+          @relation = relation
+          @column = column || relation.primary_key
+        end
+
+        # Tells whether the given arguments must not be used for batch counting.
+        #
+        # @param finish [Integer] highest primary key of the range
+        # @param batch_size [Integer] number of ids covered by one query
+        # @param start [Integer] lowest primary key of the range
+        # @return [Boolean] true when batch_size is at or below MIN_REQUIRED_BATCH_SIZE, when the
+        #   range spans MAX_DATA_VOLUME ids or more, or when start is greater than finish
+        def unwanted_configuration?(finish, batch_size, start)
+          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+            (finish - start) >= MAX_DATA_VOLUME ||
+            start > finish
+        end
+
+        # Estimates the number of distinct non-NULL values of the counted column.
+        #
+        # The primary key range start..finish is walked in windows of batch_size ids. Every window
+        # yields a bucket_num => bucket_hash map; across windows the highest bucket_hash per bucket
+        # is kept, and the merged map is turned into the final estimate.
+        #
+        # @param batch_size [Integer, nil] ids per query, defaults to DEFAULT_BATCH_SIZE
+        # @param start [Integer, nil] lowest primary key to scan, defaults to the relation's minimum
+        # @param finish [Integer, nil] highest primary key to scan (inclusive), defaults to the
+        #   relation's maximum
+        # @return [Numeric] the estimate, or FALLBACK when #unwanted_configuration? rejects the arguments
+        # @raise [RuntimeError] when called inside an open transaction or when start or finish is negative
+        def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+          raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
+
+          batch_size ||= DEFAULT_BATCH_SIZE
+
+          start = actual_start(start)
+          finish = actual_finish(finish)
+
+          raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
+          return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+
+          batch_start = start
+          hll_blob = {}
+
+          while batch_start <= finish
+            # The upper bound of a window is exclusive. Cap it at finish + 1 so that rows with a
+            # primary key above `finish` never leak into the estimate through the last window.
+            batch_end = [batch_start + batch_size, finish + 1].min
+
+            hll_blob.merge!(hll_blob_for_batch(batch_start, batch_end)) { |_key, old, new| [old, new].max }
+            batch_start += batch_size
+
+            sleep(SLEEP_TIME_IN_SECONDS)
+          end
+
+          estimate_cardinality(hll_blob)
+        end
+
+        private
+
+        # Turns the merged bucket map into the estimated number of distinct values.
+        #
+        # arbitrary values that are present in #estimate_cardinality
+        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+        # article, they do not represent any entity and serve as tune values
+        # for the whole equation
+        #
+        # @param hll_blob [Hash{Integer => Integer}] bucket_num => bucket_hash
+        # @return [Float, Integer] a Float when the empty-bucket based formula is used
+        #   (some buckets are empty and the raw estimate is below 2.5 * TOTAL_BUCKETS_NUMBER),
+        #   otherwise the raw estimate truncated to an Integer
+        def estimate_cardinality(hll_blob)
+          num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
+          # tune factor shared by both formulas below
+          correction = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)
+
+          num_uniques = (
+            ((TOTAL_BUCKETS_NUMBER**2) * correction) /
+              (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash) })
+          ).to_i
+
+          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
+            correction * (TOTAL_BUCKETS_NUMBER * Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets))
+          else
+            num_uniques
+          end
+        end
+
+        # Runs BUCKETED_DATA_SQL for one primary key window.
+        #
+        # @param start first primary key of the window (inclusive)
+        # @param finish end of the window (exclusive)
+        # @return [Hash] the values of every result row turned into a key/value pair,
+        #   i.e. bucket_num => bucket_hash
+        def hll_blob_for_batch(start, finish)
+          @relation
+            .connection
+            .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
+            .map(&:values)
+            .to_h
+        end
+
+        # Generate the source query SQL snippet for the provided id range
+        #
+        # @example SQL query template
+        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+        #   FROM %{relation}
+        #   WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
+        #   AND %{column} IS NOT NULL
+        #
+        # @param start first primary key of the range (inclusive)
+        # @param finish end of the range (exclusive)
+        # @return [String] SQL query fragment
+        def source_query(start, finish)
+          # anything that is not already an Arel attribute is embedded as a raw SQL fragment
+          col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
+          col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
+          md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
+          md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
+          bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
+
+          @relation
+            .where(@relation.primary_key => (start...finish))
+            .where(col_as_arel.not_eq(nil))
+            .select(bits.as('attr_hash_32_bits')).to_sql
+        end
+
+        # @param start [Integer, nil] explicitly requested lower bound
+        # @return the given start, else the relation's smallest primary key (grouping removed), else 0
+        def actual_start(start)
+          start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
+        end
+
+        # @param finish [Integer, nil] explicitly requested upper bound
+        # @return the given finish, else the relation's largest primary key (grouping removed), else 0
+        def actual_finish(finish)
+          finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
+        end
+      end
+    end
+  end
+end