From 58bfd733310effa94af0e1f1f19e53e34235cffc Mon Sep 17 00:00:00 2001
From: Jan Provaznik <jprovaznik@gitlab.com>
Date: Sun, 2 Dec 2018 22:47:33 +0100
Subject: Optimized file search to work without limits

* removed the limit of 100 on file search results because we
  load all results anyway
* expensive processing (parsing match content, UTF-8 encoding)
  is done only for the selected page of the paginated output
---
 lib/gitlab/search/found_blob.rb | 162 ++++++++++++++++++++++++++++++++++++++++
 lib/gitlab/search/query.rb      |   6 +-
 2 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 lib/gitlab/search/found_blob.rb

(limited to 'lib/gitlab/search')
diff --git a/lib/gitlab/search/found_blob.rb b/lib/gitlab/search/found_blob.rb
new file mode 100644
index 00000000000..a62ab1521a7
--- /dev/null
+++ b/lib/gitlab/search/found_blob.rb
@@ -0,0 +1,162 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Search
+    class FoundBlob
+      include EncodingHelper
+      include Presentable
+      include BlobLanguageFromGitAttributes
+      include Gitlab::Utils::StrongMemoize
+
+      attr_reader :project, :content_match, :blob_filename
+
+      FILENAME_REGEXP = /\A(?<ref>[^:]*):(?<filename>[^\x00]*)\x00/.freeze # "<ref>:<filename>\0" at string start
+      CONTENT_REGEXP = /^(?<ref>[^:]*):(?<filename>[^\x00]*)\x00(?<startline>\d+)\x00/.freeze # "<ref>:<filename>\0<startline>\0" at line start
+
+      def self.preload_blobs(blobs)
+        # Only instances of this class that carry a blob_filename are fetched.
+        fetchable = blobs.select { |candidate| candidate.is_a?(self) && candidate.blob_filename }
+        fetchable.each(&:fetch_blob)
+      end
+
+      def initialize(opts = {})
+        @id = opts.fetch(:id, nil)
+        @binary_filename = opts.fetch(:filename, nil)
+        @binary_basename = opts.fetch(:basename, nil)
+        @ref = opts.fetch(:ref, nil)
+        @startline = opts.fetch(:startline, nil)
+        @binary_data = opts.fetch(:data, nil)
+        @per_page = opts.fetch(:per_page, 20)
+        @project = opts.fetch(:project, nil)
+        # Some callers do not have a project object (e.g. Elasticsearch),
+        # yet they can trigger many calls in one go, which would cause
+        # duplicated queries.
+        # Such callers may pass just project_id instead.
+        @project_id = opts.fetch(:project_id, nil)
+        @content_match = opts.fetch(:content_match, nil)
+        @blob_filename = opts.fetch(:blob_filename, nil)
+        @repository = opts.fetch(:repository, nil)
+      end
+
+      def id
+        @id ||= parsed_content[:id] # the :id option wins; otherwise taken from parsed_content (may be nil)
+      end
+
+      def ref
+        @ref ||= parsed_content[:ref] # NOTE(review): fetch_blob reads #ref; confirm :ref always accompanies :blob_filename
+      end
+
+      def startline
+        @startline ||= parsed_content[:startline] # the :startline option wins; otherwise taken from parsed_content
+      end
+
+      # binary_filename is used for running filters on all matches.
+      # For grepped results (which use content_match), we get the
+      # filename from the beginning of the grepped result, which is faster
+      # than parsing the whole snippet.
+      def binary_filename
+        @binary_filename ||= content_match ? search_result_filename : parsed_content[:binary_filename]
+      end
+
+      def filename
+        @filename ||= encode_utf8(@binary_filename || parsed_content[:binary_filename]) # memoized; raw name run through encode_utf8
+      end
+
+      def basename
+        @basename ||= encode_utf8(@binary_basename || parsed_content[:binary_basename]) # memoized; raw basename run through encode_utf8
+      end
+
+      def data
+        @data ||= encode_utf8(@binary_data || parsed_content[:binary_data]) # memoized; raw content run through encode_utf8
+      end
+
+      def path
+        filename # same value as #filename
+      end
+
+      def project_id
+        @project_id || @project&.id # explicit project_id wins; else derived from project (nil if neither is set)
+      end
+
+      def present
+        super(presenter_class: BlobPresenter) # always presented with BlobPresenter
+      end
+
+      def fetch_blob
+        path = [ref, blob_filename] # batch key for this blob
+        missing_blob = { binary_filename: blob_filename } # fallback if the blob can't be fetched: filename only
+
+        BatchLoader.for(path).batch(default_value: missing_blob) do |refs, loader|
+          Gitlab::Git::Blob.batch(repository, refs, blob_size_limit: 1024).each do |blob|
+            # Same keys as parse_search_result's return value, plus :id;
+            # startline is fixed to 1 for blobs loaded this way.
+            data = {
+              id: blob.id,
+              binary_filename: blob.path,
+              binary_basename: File.basename(blob.path, File.extname(blob.path)),
+              ref: ref,
+              startline: 1,
+              binary_data: blob.data,
+              project: project
+            }
+
+            loader.call([ref, blob.path], data) # NOTE(review): key is built from this instance's ref — confirm for mixed-ref batches
+          end
+        end
+      end
+
+      private
+
+      def search_result_filename
+        FILENAME_REGEXP.match(content_match) { |found| found[:filename] } # nil when no "<ref>:<filename>\0" header
+      end
+
+      def parsed_content
+        strong_memoize(:parsed_content) do
+          # grep-style match text takes precedence over a bare blob filename
+          next parse_search_result if content_match
+          # filename-only result: load the blob through fetch_blob
+          next fetch_blob if blob_filename
+
+          # neither source available
+          {}
+        end
+      end
+
+      def parse_search_result
+        # Splits grep-style output into ref, filename, first line number and the prefix-less snippet.
+        ref = nil
+        filename = nil
+        basename = nil
+        data = []
+        startline = 0
+
+        content_match.each_line.each_with_index do |line, index|
+          # `prefix` is local to this block, so each line is matched afresh; plain `=` says so.
+          prefix = line.match(CONTENT_REGEXP)&.tap do |matches|
+            ref = matches[:ref]
+            filename = matches[:filename]
+            startline = matches[:startline].to_i - index # line number of the snippet's first line
+            extname = Regexp.escape(File.extname(filename))
+            basename = filename.sub(/#{extname}$/, '')
+          end
+          # Strip the "<ref>:<filename>\0<line>\0" header; lines without one are kept as-is.
+          data << line.sub(prefix.to_s, '')
+        end
+
+        {
+          binary_filename: filename,
+          binary_basename: basename,
+          ref: ref,
+          startline: startline,
+          binary_data: data.join,
+          project: project
+        }
+      end
+
+      def repository
+        @repository ||= project.repository # the :repository option wins; otherwise a project must be set
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/search/query.rb b/lib/gitlab/search/query.rb
index 7f69083a492..ba0e16607a6 100644
--- a/lib/gitlab/search/query.rb
+++ b/lib/gitlab/search/query.rb
@@ -3,6 +3,8 @@
 module Gitlab
   module Search
     class Query < SimpleDelegator
+      include EncodingHelper
+
       def initialize(query, filter_opts = {}, &block)
         @raw_query = query.dup
         @filters = []
@@ -50,7 +52,9 @@ module Gitlab
       end
 
       def parse_filter(filter, input)
-        filter[:parser].call(input)
+        result = filter[:parser].call(input)
+
+        @filter_options[:encode_binary] ? encode_binary(result) : result # passed through encode_binary only when :encode_binary is set
       end
     end
   end
-- 
cgit v1.2.3