Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/database/similarity_score.rb')
-rw-r--r--lib/gitlab/database/similarity_score.rb110
1 files changed, 110 insertions, 0 deletions
diff --git a/lib/gitlab/database/similarity_score.rb b/lib/gitlab/database/similarity_score.rb
new file mode 100644
index 00000000000..2633c29438a
--- /dev/null
+++ b/lib/gitlab/database/similarity_score.rb
@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ class SimilarityScore
+ EMPTY_STRING = Arel.sql("''").freeze
+ EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
+ DEFAULT_MULTIPLIER = 1
+
+ # This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity.
+ #
+ # Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller
+ # resultset which is already filtered by other conditions (< 10_000 records).
+ #
+ # ==== Parameters
+ # * +search+ - [String] the user provided search string
+ # * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
+ # * +column+ - Arel column expression, example: Project.arel_table["name"]
+ # * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
+ #
+ # ==== Use case
+ #
+ # We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
+ # it's more likely that the user was remembering the path or the name of the project.
+ #
+ # Rules:
+ # [
+ # { column: Project.arel_table['path'], multiplier: 1 },
+ # { column: Project.arel_table['name'], multiplier: 1 },
+ # { column: Project.arel_table['description'], multiplier: 0.5 }
+ # ]
+ #
+ # ==== Examples
+ #
+ # Similarity calculation based on one column:
+ #
+ # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
+ #
+ # Similarity calculation based on two column, where the second column has lower priority:
+ #
+ # Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [
+ # { column: Project.arel_table['name'], multiplier: 1 },
+ # { column: Project.arel_table['description'], multiplier: 0.5 }
+ # ])
+ #
+ # Integration with an ActiveRecord query:
+ #
+ # table = Project.arel_table
+ #
+ # order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [
+ # { column: table['name'], multiplier: 1 },
+ # { column: table['description'], multiplier: 0.5 }
+ # ])
+ #
+ # Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
+ #
+ # The expression can be also used in SELECT:
+ #
+ # results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
+ # puts results.map(&:similarity)
+ #
+ def self.build_expression(search:, rules:)
+ return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty?
+
+ quoted_search = ActiveRecord::Base.connection.quote(search.to_s)
+
+ first_expression, *expressions = rules.map do |rule|
+ rule_to_arel(quoted_search, rule)
+ end
+
+ # (SIMILARITY ...) + (SIMILARITY ...)
+ expressions.inject(first_expression) do |expression1, expression2|
+ Arel::Nodes::Addition.new(expression1, expression2)
+ end
+ end
+
+ # (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
+ def self.rule_to_arel(search, rule)
+ Arel::Nodes::Grouping.new(
+ Arel::Nodes::Multiplication.new(
+ similarity_function_call(search, column_expression(rule)),
+ multiplier_expression(rule)
+ )
+ )
+ end
+
+ # COALESCE(column, '')
+ def self.column_expression(rule)
+ Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
+ end
+
+ # SIMILARITY(COALESCE(column, ''), 'search_string')
+ def self.similarity_function_call(search, column)
+ Arel::Nodes::NamedFunction.new('SIMILARITY', [column, Arel.sql(search)])
+ end
+
+ # CAST(multiplier AS numeric)
+ def self.multiplier_expression(rule)
+ quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)
+
+ Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
+ end
+
+ private_class_method :rule_to_arel
+ private_class_method :column_expression
+ private_class_method :similarity_function_call
+ private_class_method :multiplier_expression
+ end
+ end
+end