From 58bfd733310effa94af0e1f1f19e53e34235cffc Mon Sep 17 00:00:00 2001 From: Jan Provaznik Date: Sun, 2 Dec 2018 22:47:33 +0100 Subject: Optimized file search to work without limits * removed 100 limit on file search results because we load all results anyway * expensive processing (parsing match content, utf encoding) is done only for selected page in paginated output --- lib/gitlab/search/found_blob.rb | 162 ++++++++++++++++++++++++++++++++++++++++ lib/gitlab/search/query.rb | 6 +- 2 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 lib/gitlab/search/found_blob.rb (limited to 'lib/gitlab/search') diff --git a/lib/gitlab/search/found_blob.rb b/lib/gitlab/search/found_blob.rb new file mode 100644 index 00000000000..a62ab1521a7 --- /dev/null +++ b/lib/gitlab/search/found_blob.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +module Gitlab + module Search + class FoundBlob + include EncodingHelper + include Presentable + include BlobLanguageFromGitAttributes + include Gitlab::Utils::StrongMemoize + + attr_reader :project, :content_match, :blob_filename + + FILENAME_REGEXP = /\A(?<ref>[^:]*):(?<filename>[^\x00]*)\x00/.freeze + CONTENT_REGEXP = /^(?<ref>[^:]*):(?<filename>[^\x00]*)\x00(?<startline>\d+)\x00/.freeze + + def self.preload_blobs(blobs) + to_fetch = blobs.select { |blob| blob.is_a?(self) && blob.blob_filename } + + to_fetch.each { |blob| blob.fetch_blob } + end + + def initialize(opts = {}) + @id = opts.fetch(:id, nil) + @binary_filename = opts.fetch(:filename, nil) + @binary_basename = opts.fetch(:basename, nil) + @ref = opts.fetch(:ref, nil) + @startline = opts.fetch(:startline, nil) + @binary_data = opts.fetch(:data, nil) + @per_page = opts.fetch(:per_page, 20) + @project = opts.fetch(:project, nil) + # Some callers do not have a project object (e.g. elastic search), + # yet they can trigger many calls in one go, + # causing duplicated queries. + # Allow those to just pass project_id instead. 
+ @project_id = opts.fetch(:project_id, nil) + @content_match = opts.fetch(:content_match, nil) + @blob_filename = opts.fetch(:blob_filename, nil) + @repository = opts.fetch(:repository, nil) + end + + def id + @id ||= parsed_content[:id] + end + + def ref + @ref ||= parsed_content[:ref] + end + + def startline + @startline ||= parsed_content[:startline] + end + + # binary_filename is used for running filters on all matches, + # for grepped results (which use content_match), we get + # filename from the beginning of the grepped result which is faster + # than parsing the whole snippet + def binary_filename + @binary_filename ||= content_match ? search_result_filename : parsed_content[:binary_filename] + end + + def filename + @filename ||= encode_utf8(@binary_filename || parsed_content[:binary_filename]) + end + + def basename + @basename ||= encode_utf8(@binary_basename || parsed_content[:binary_basename]) + end + + def data + @data ||= encode_utf8(@binary_data || parsed_content[:binary_data]) + end + + def path + filename + end + + def project_id + @project_id || @project&.id + end + + def present + super(presenter_class: BlobPresenter) + end + + def fetch_blob + path = [ref, blob_filename] + missing_blob = { binary_filename: blob_filename } + + BatchLoader.for(path).batch(default_value: missing_blob) do |refs, loader| + Gitlab::Git::Blob.batch(repository, refs, blob_size_limit: 1024).each do |blob| + # if the blob couldn't be fetched for some reason, + # show at least the blob filename + data = { + id: blob.id, + binary_filename: blob.path, + binary_basename: File.basename(blob.path, File.extname(blob.path)), + ref: ref, + startline: 1, + binary_data: blob.data, + project: project + } + + loader.call([ref, blob.path], data) + end + end + end + + private + + def search_result_filename + content_match.match(FILENAME_REGEXP) { |matches| matches[:filename] } + end + + def parsed_content + strong_memoize(:parsed_content) do + if content_match + parse_search_result + 
elsif blob_filename + fetch_blob + else + {} + end + end + end + + def parse_search_result + ref = nil + filename = nil + basename = nil + + data = [] + startline = 0 + + content_match.each_line.each_with_index do |line, index| + prefix ||= line.match(CONTENT_REGEXP)&.tap do |matches| + ref = matches[:ref] + filename = matches[:filename] + startline = matches[:startline] + startline = startline.to_i - index + extname = Regexp.escape(File.extname(filename)) + basename = filename.sub(/#{extname}$/, '') + end + + data << line.sub(prefix.to_s, '') + end + + { + binary_filename: filename, + binary_basename: basename, + ref: ref, + startline: startline, + binary_data: data.join, + project: project + } + end + + def repository + @repository ||= project.repository + end + end + end +end diff --git a/lib/gitlab/search/query.rb b/lib/gitlab/search/query.rb index 7f69083a492..ba0e16607a6 100644 --- a/lib/gitlab/search/query.rb +++ b/lib/gitlab/search/query.rb @@ -3,6 +3,8 @@ module Gitlab module Search class Query < SimpleDelegator + include EncodingHelper + def initialize(query, filter_opts = {}, &block) @raw_query = query.dup @filters = [] @@ -50,7 +52,9 @@ module Gitlab end def parse_filter(filter, input) - filter[:parser].call(input) + result = filter[:parser].call(input) + + @filter_options[:encode_binary] ? encode_binary(result) : result end end end -- cgit v1.2.3