diff options
author | Jan Provaznik <jprovaznik@gitlab.com> | 2018-12-03 00:47:33 +0300 |
---|---|---|
committer | Jan Provaznik <jprovaznik@gitlab.com> | 2018-12-06 11:25:09 +0300 |
commit | 58bfd733310effa94af0e1f1f19e53e34235cffc (patch) | |
tree | e793b8f8b2669034e80b7668304f3fc75dc23deb /lib | |
parent | 00acef434031b5dc0bf39576a9e83802c7806842 (diff) |
Optimized file search to work without limits
* removed 100 limit on file search results because we
load all results anyway
* expensive processing (parsing match content, utf encoding)
is done only for selected page in paginated output
Diffstat (limited to 'lib')
-rw-r--r-- | lib/api/search.rb | 7 | ||||
-rw-r--r-- | lib/gitlab/file_finder.rb | 57 | ||||
-rw-r--r-- | lib/gitlab/project_search_results.rb | 43 | ||||
-rw-r--r-- | lib/gitlab/search/found_blob.rb | 162 | ||||
-rw-r--r-- | lib/gitlab/search/query.rb | 6 | ||||
-rw-r--r-- | lib/gitlab/search_results.rb | 36 | ||||
-rw-r--r-- | lib/gitlab/wiki_file_finder.rb | 6 |
7 files changed, 196 insertions, 121 deletions
diff --git a/lib/api/search.rb b/lib/api/search.rb index 5900e1cccc2..f5db692afe5 100644 --- a/lib/api/search.rb +++ b/lib/api/search.rb @@ -35,12 +35,7 @@ module API end def process_results(results) - case params[:scope] - when 'blobs', 'wiki_blobs' - paginate(results).map { |blob| blob[1] } - else - paginate(results) - end + paginate(results) end def snippets? diff --git a/lib/gitlab/file_finder.rb b/lib/gitlab/file_finder.rb index b4db3f93c9c..3958814208c 100644 --- a/lib/gitlab/file_finder.rb +++ b/lib/gitlab/file_finder.rb @@ -4,8 +4,6 @@ # the result is joined and sorted by file name module Gitlab class FileFinder - BATCH_SIZE = 100 - attr_reader :project, :ref delegate :repository, to: :project @@ -16,60 +14,35 @@ module Gitlab end def find(query) - query = Gitlab::Search::Query.new(query) do - filter :filename, matcher: ->(filter, blob) { blob.filename =~ /#{filter[:regex_value]}$/i } - filter :path, matcher: ->(filter, blob) { blob.filename =~ /#{filter[:regex_value]}/i } - filter :extension, matcher: ->(filter, blob) { blob.filename =~ /\.#{filter[:regex_value]}$/i } + query = Gitlab::Search::Query.new(query, encode_binary: true) do + filter :filename, matcher: ->(filter, blob) { blob.binary_filename =~ /#{filter[:regex_value]}$/i } + filter :path, matcher: ->(filter, blob) { blob.binary_filename =~ /#{filter[:regex_value]}/i } + filter :extension, matcher: ->(filter, blob) { blob.binary_filename =~ /\.#{filter[:regex_value]}$/i } end - by_content = find_by_content(query.term) - - already_found = Set.new(by_content.map(&:filename)) - by_filename = find_by_filename(query.term, except: already_found) + files = find_by_filename(query.term) + find_by_content(query.term) - files = (by_content + by_filename) - .sort_by(&:filename) + files = query.filter_results(files) if query.filters.any? - query.filter_results(files).map { |blob| [blob.filename, blob] } + files end private def find_by_content(query) - results = repository.search_files_by_content(query, ref).first(BATCH_SIZE) - results.map { |result| Gitlab::ProjectSearchResults.parse_search_result(result, project) } - end - - def find_by_filename(query, except: []) - filenames = search_filenames(query, except) - - blobs(filenames).map do |blob| - Gitlab::SearchResults::FoundBlob.new( - id: blob.id, - filename: blob.path, - basename: File.basename(blob.path, File.extname(blob.path)), - ref: ref, - startline: 1, - data: blob.data, - project: project - ) + repository.search_files_by_content(query, ref).map do |result| + Gitlab::Search::FoundBlob.new(content_match: result, project: project, ref: ref, repository: repository) end end - def search_filenames(query, except) - filenames = repository.search_files_by_name(query, ref).first(BATCH_SIZE) - - filenames.delete_if { |filename| except.include?(filename) } unless except.empty? - - filenames - end - - def blob_refs(filenames) - filenames.map { |filename| [ref, filename] } + def find_by_filename(query) + search_filenames(query).map do |filename| + Gitlab::Search::FoundBlob.new(blob_filename: filename, project: project, ref: ref, repository: repository) + end end - def blobs(filenames) - Gitlab::Git::Blob.batch(repository, blob_refs(filenames), blob_size_limit: 1024) + def search_filenames(query) + repository.search_files_by_name(query, ref) end end end diff --git a/lib/gitlab/project_search_results.rb b/lib/gitlab/project_search_results.rb index 04df881bf03..a68f8801c2a 100644 --- a/lib/gitlab/project_search_results.rb +++ b/lib/gitlab/project_search_results.rb @@ -17,9 +17,9 @@ module Gitlab when 'notes' notes.page(page).per(per_page) when 'blobs' - Kaminari.paginate_array(blobs).page(page).per(per_page) + paginated_blobs(blobs, page) when 'wiki_blobs' - Kaminari.paginate_array(wiki_blobs).page(page).per(per_page) + paginated_blobs(wiki_blobs, page) when 'commits' Kaminari.paginate_array(commits).page(page).per(per_page) else @@ -55,37 +55,6 @@ module Gitlab @commits_count ||= commits.count end - def self.parse_search_result(result, project = nil) - ref = nil - filename = nil - basename = nil - - data = [] - startline = 0 - - result.each_line.each_with_index do |line, index| - prefix ||= line.match(/^(?<ref>[^:]*):(?<filename>[^\x00]*)\x00(?<startline>\d+)\x00/)&.tap do |matches| - ref = matches[:ref] - filename = matches[:filename] - startline = matches[:startline] - startline = startline.to_i - index - extname = Regexp.escape(File.extname(filename)) - basename = filename.sub(/#{extname}$/, '') - end - - data << line.sub(prefix.to_s, '') - end - - FoundBlob.new( - filename: filename, - basename: basename, - ref: ref, - startline: startline, - data: data.join, - project: project - ) - end - def single_commit_result? return false if commits_count != 1 @@ -97,6 +66,14 @@ module Gitlab private + def paginated_blobs(blobs, page) + results = Kaminari.paginate_array(blobs).page(page).per(per_page) + + Gitlab::Search::FoundBlob.preload_blobs(results) + + results + end + def blobs return [] unless Ability.allowed?(@current_user, :download_code, @project) diff --git a/lib/gitlab/search/found_blob.rb b/lib/gitlab/search/found_blob.rb new file mode 100644 index 00000000000..a62ab1521a7 --- /dev/null +++ b/lib/gitlab/search/found_blob.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +module Gitlab + module Search + class FoundBlob + include EncodingHelper + include Presentable + include BlobLanguageFromGitAttributes + include Gitlab::Utils::StrongMemoize + + attr_reader :project, :content_match, :blob_filename + + FILENAME_REGEXP = /\A(?<ref>[^:]*):(?<filename>[^\x00]*)\x00/.freeze + CONTENT_REGEXP = /^(?<ref>[^:]*):(?<filename>[^\x00]*)\x00(?<startline>\d+)\x00/.freeze + + def self.preload_blobs(blobs) + to_fetch = blobs.select { |blob| blob.is_a?(self) && blob.blob_filename } + + to_fetch.each { |blob| blob.fetch_blob } + end + + def initialize(opts = {}) + @id = opts.fetch(:id, nil) + @binary_filename = opts.fetch(:filename, nil) + @binary_basename = opts.fetch(:basename, nil) + @ref = opts.fetch(:ref, nil) + @startline = opts.fetch(:startline, nil) + @binary_data = opts.fetch(:data, nil) + @per_page = opts.fetch(:per_page, 20) + @project = opts.fetch(:project, nil) + # Some caller does not have project object (e.g. elastic search), + # yet they can trigger many calls in one go, + # causing duplicated queries. + # Allow those to just pass project_id instead. + @project_id = opts.fetch(:project_id, nil) + @content_match = opts.fetch(:content_match, nil) + @blob_filename = opts.fetch(:blob_filename, nil) + @repository = opts.fetch(:repository, nil) + end + + def id + @id ||= parsed_content[:id] + end + + def ref + @ref ||= parsed_content[:ref] + end + + def startline + @startline ||= parsed_content[:startline] + end + + # binary_filename is used for running filters on all matches, + # for grepped results (which use content_match), we get + # filename from the beginning of the grepped result which is faster + # then parsing whole snippet + def binary_filename + @binary_filename ||= content_match ? search_result_filename : parsed_content[:binary_filename] + end + + def filename + @filename ||= encode_utf8(@binary_filename || parsed_content[:binary_filename]) + end + + def basename + @basename ||= encode_utf8(@binary_basename || parsed_content[:binary_basename]) + end + + def data + @data ||= encode_utf8(@binary_data || parsed_content[:binary_data]) + end + + def path + filename + end + + def project_id + @project_id || @project&.id + end + + def present + super(presenter_class: BlobPresenter) + end + + def fetch_blob + path = [ref, blob_filename] + missing_blob = { binary_filename: blob_filename } + + BatchLoader.for(path).batch(default_value: missing_blob) do |refs, loader| + Gitlab::Git::Blob.batch(repository, refs, blob_size_limit: 1024).each do |blob| + # if the blob couldn't be fetched for some reason, + # show at least the blob filename + data = { + id: blob.id, + binary_filename: blob.path, + binary_basename: File.basename(blob.path, File.extname(blob.path)), + ref: ref, + startline: 1, + binary_data: blob.data, + project: project + } + + loader.call([ref, blob.path], data) + end + end + end + + private + + def search_result_filename + content_match.match(FILENAME_REGEXP) { |matches| matches[:filename] } + end + + def parsed_content + strong_memoize(:parsed_content) do + if content_match + parse_search_result + elsif blob_filename + fetch_blob + else + {} + end + end + end + + def parse_search_result + ref = nil + filename = nil + basename = nil + + data = [] + startline = 0 + + content_match.each_line.each_with_index do |line, index| + prefix ||= line.match(CONTENT_REGEXP)&.tap do |matches| + ref = matches[:ref] + filename = matches[:filename] + startline = matches[:startline] + startline = startline.to_i - index + extname = Regexp.escape(File.extname(filename)) + basename = filename.sub(/#{extname}$/, '') + end + + data << line.sub(prefix.to_s, '') + end + + { + binary_filename: filename, + binary_basename: basename, + ref: ref, + startline: startline, + binary_data: data.join, + project: project + } + end + + def repository + @repository ||= project.repository + end + end + end +end diff --git a/lib/gitlab/search/query.rb b/lib/gitlab/search/query.rb index 7f69083a492..ba0e16607a6 100644 --- a/lib/gitlab/search/query.rb +++ b/lib/gitlab/search/query.rb @@ -3,6 +3,8 @@ module Gitlab module Search class Query < SimpleDelegator + include EncodingHelper + def initialize(query, filter_opts = {}, &block) @raw_query = query.dup @filters = [] @@ -50,7 +52,9 @@ module Gitlab end def parse_filter(filter, input) - filter[:parser].call(input) + result = filter[:parser].call(input) + + @filter_options[:encode_binary] ? encode_binary(result) : result end end end diff --git a/lib/gitlab/search_results.rb b/lib/gitlab/search_results.rb index 458737f31eb..491148ec1a6 100644 --- a/lib/gitlab/search_results.rb +++ b/lib/gitlab/search_results.rb @@ -2,42 +2,6 @@ module Gitlab class SearchResults - class FoundBlob - include EncodingHelper - include Presentable - include BlobLanguageFromGitAttributes - - attr_reader :id, :filename, :basename, :ref, :startline, :data, :project - - def initialize(opts = {}) - @id = opts.fetch(:id, nil) - @filename = encode_utf8(opts.fetch(:filename, nil)) - @basename = encode_utf8(opts.fetch(:basename, nil)) - @ref = opts.fetch(:ref, nil) - @startline = opts.fetch(:startline, nil) - @data = encode_utf8(opts.fetch(:data, nil)) - @per_page = opts.fetch(:per_page, 20) - @project = opts.fetch(:project, nil) - # Some caller does not have project object (e.g. elastic search), - # yet they can trigger many calls in one go, - # causing duplicated queries. - # Allow those to just pass project_id instead. - @project_id = opts.fetch(:project_id, nil) - end - - def path - filename - end - - def project_id - @project_id || @project&.id - end - - def present - super(presenter_class: BlobPresenter) - end - end - attr_reader :current_user, :query, :per_page # Limit search results by passed projects diff --git a/lib/gitlab/wiki_file_finder.rb b/lib/gitlab/wiki_file_finder.rb index a00cd65594c..5303b3582ab 100644 --- a/lib/gitlab/wiki_file_finder.rb +++ b/lib/gitlab/wiki_file_finder.rb @@ -2,6 +2,8 @@ module Gitlab class WikiFileFinder < FileFinder + BATCH_SIZE = 100 + attr_reader :repository def initialize(project, ref) @@ -12,13 +14,11 @@ module Gitlab private - def search_filenames(query, except) + def search_filenames(query) safe_query = Regexp.escape(query.tr(' ', '-')) safe_query = Regexp.new(safe_query, Regexp::IGNORECASE) filenames = repository.ls_files(ref) - filenames.delete_if { |filename| except.include?(filename) } unless except.empty? - filenames.grep(safe_query).first(BATCH_SIZE) end end |