gitlab.com/gitlab-org/gitlab-foss.git
Diffstat (limited to 'lib/gitlab/git/blob.rb')
-rw-r--r--  lib/gitlab/git/blob.rb  231
1 file changed, 170 insertions(+), 61 deletions(-)
diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb
index a7aceab4c14..eabcf46cf58 100644
--- a/lib/gitlab/git/blob.rb
+++ b/lib/gitlab/git/blob.rb
@@ -1,3 +1,5 @@
+# Gitaly note: JV: seems to be completely migrated (behind feature flags).
+
 module Gitlab
   module Git
     class Blob
@@ -10,6 +12,12 @@ module Gitlab
       # blob data should use load_all_data!.
       MAX_DATA_DISPLAY_SIZE = 10.megabytes
 
+      # These limits are used as a heuristic to ignore files which can't be LFS
+      # pointers. The format of these is described in
+      # https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer
+      LFS_POINTER_MIN_SIZE = 120.bytes
+      LFS_POINTER_MAX_SIZE = 200.bytes
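+
+      # For illustration, an LFS pointer blob is a small text file of the
+      # following shape (the oid and size values here are made up):
+      #
+      #   version https://git-lfs.github.com/spec/v1
+      #   oid sha256:4665a5ea423c2713d436b5ee50593a9640e0018c1550b5a0002f74190d6f1a83
+      #   size 12345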
+
       attr_accessor :name, :path, :size, :data, :mode, :id, :commit_id, :loaded_size, :binary
 
       class << self
@@ -18,18 +26,132 @@ module Gitlab
             if is_enabled
               find_by_gitaly(repository, sha, path)
             else
-              find_by_rugged(repository, sha, path)
+              find_by_rugged(repository, sha, path, limit: MAX_DATA_DISPLAY_SIZE)
+            end
+          end
+        end
+
+        def raw(repository, sha)
+          Gitlab::GitalyClient.migrate(:git_blob_raw) do |is_enabled|
+            if is_enabled
+              repository.gitaly_blob_client.get_blob(oid: sha, limit: MAX_DATA_DISPLAY_SIZE)
+            else
+              rugged_raw(repository, sha, limit: MAX_DATA_DISPLAY_SIZE)
+            end
+          end
+        end
+
+        # Returns an array of Blob instances, specified in blob_references as
+        # [[commit_sha, path], [commit_sha, path], ...]. If blob_size_limit < 0,
+        # the full blob contents are returned. If blob_size_limit >= 0, each
+        # blob's data attribute will contain no more than blob_size_limit bytes.
+        #
+        # Keep in mind that this method may allocate a lot of memory. It is up
+        # to the caller to limit the number of blobs and blob_size_limit.
+        #
+        # Gitaly migration issue: https://gitlab.com/gitlab-org/gitaly/issues/798
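+        #
+        # Illustrative call (the sha and paths are hypothetical):
+        #
+        #   Gitlab::Git::Blob.batch(repository, [[sha, 'README.md'], [sha, 'VERSION']],
+        #                           blob_size_limit: 1.kilobyte)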
+        def batch(repository, blob_references, blob_size_limit: MAX_DATA_DISPLAY_SIZE)
+          Gitlab::GitalyClient.migrate(:list_blobs_by_sha_path) do |is_enabled|
+            if is_enabled
+              repository.gitaly_blob_client.get_blobs(blob_references, blob_size_limit).to_a
+            else
+              blob_references.map do |sha, path|
+                find_by_rugged(repository, sha, path, limit: blob_size_limit)
+              end
             end
           end
         end
 
-        def find_by_gitaly(repository, sha, path)
-          path = path.sub(/\A\/*/, '')
+        # Find LFS blobs given an array of sha ids
+        # Returns array of Gitlab::Git::Blob
+        # Does not guarantee blob data will be set
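+        #
+        # Illustrative call (lfs_oids is a hypothetical array of blob shas):
+        #
+        #   Gitlab::Git::Blob.batch_lfs_pointers(repository, lfs_oids)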
+        def batch_lfs_pointers(repository, blob_ids)
+          repository.gitaly_migrate(:batch_lfs_pointers) do |is_enabled|
+            if is_enabled
+              repository.gitaly_blob_client.batch_lfs_pointers(blob_ids.to_a)
+            else
+              blob_ids.lazy
+                      .select { |sha| possible_lfs_blob?(repository, sha) }
+                      .map { |sha| rugged_raw(repository, sha, limit: LFS_POINTER_MAX_SIZE) }
+                      .select(&:lfs_pointer?)
+                      .force
+            end
+          end
+        end
+
+        def binary?(data)
+          EncodingHelper.detect_libgit2_binary?(data)
+        end
+
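+        # Note that between? is inclusive on both ends, so for instance
+        # size_could_be_lfs?(120) and size_could_be_lfs?(200) are true, while
+        # size_could_be_lfs?(10.kilobytes) is false.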
+        def size_could_be_lfs?(size)
+          size.between?(LFS_POINTER_MIN_SIZE, LFS_POINTER_MAX_SIZE)
+        end
+
+        private
+
+        # Recursive search of blob id by path
+        #
+        # Ex.
+        #   blog/            # oid: 1a
+        #     app/           # oid: 2a
+        #       models/      # oid: 3a
+        #       file.rb      # oid: 4a
+        #
+        #
+        # Blob.find_entry_by_path(repo, '1a', 'app/file.rb') # => '4a'
+        #
+        def find_entry_by_path(repository, root_id, path)
+          root_tree = repository.lookup(root_id)
+          # Strip leading slashes
+          path[%r{^/*}] = ''
+          path_arr = path.split('/')
+
+          entry = root_tree.find do |entry|
+            entry[:name] == path_arr[0]
+          end
+
+          return nil unless entry
+
+          if path_arr.size > 1
+            return nil unless entry[:type] == :tree
+
+            path_arr.shift
+            find_entry_by_path(repository, entry[:oid], path_arr.join('/'))
+          else
+            [:blob, :commit].include?(entry[:type]) ? entry : nil
+          end
+        end
+
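+        # Submodule entries carry no blob content in this repository, so they
+        # are represented as empty, zero-size blobs pointing at the submodule
+        # commit sha.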
+        def submodule_blob(blob_entry, path, sha)
+          new(
+            id: blob_entry[:oid],
+            name: blob_entry[:name],
+            size: 0,
+            data: '',
+            path: path,
+            commit_id: sha
+          )
+        end
+
+        def find_by_gitaly(repository, sha, path, limit: MAX_DATA_DISPLAY_SIZE)
+          return unless path
+
+          path = path.sub(%r{\A/*}, '')
           path = '/' if path.empty?
           name = File.basename(path)
-          entry = Gitlab::GitalyClient::Commit.new(repository).tree_entry(sha, path, MAX_DATA_DISPLAY_SIZE)
+
+          # To Gitaly, a limit of 0 means "unlimited", but callers pass 0 when
+          # they only want the blob's metadata. So we request a single byte
+          # instead, then clear that byte below: to the outside world it looks
+          # as if the limit really was 0.
+          req_limit = limit == 0 ? 1 : limit
+
+          entry = Gitlab::GitalyClient::CommitService.new(repository).tree_entry(sha, path, req_limit)
           return unless entry
 
+          entry.data = "" if limit == 0
+
           case entry.type
           when :COMMIT
             new(
@@ -41,10 +163,6 @@ module Gitlab
               commit_id: sha
             )
           when :BLOB
-            # EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
-            # only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
-            # which is what we use below to keep a consistent behavior.
-            detect = CharlockHolmes::EncodingDetector.new(8000).detect(entry.data)
             new(
               id: entry.oid,
               name: name,
@@ -53,14 +171,16 @@ module Gitlab
               mode: entry.mode.to_s(8),
               path: path,
               commit_id: sha,
-              binary: detect && detect[:type] == :binary
+              binary: binary?(entry.data)
             )
           end
         end
 
-        def find_by_rugged(repository, sha, path)
-          commit = repository.lookup(sha)
-          root_tree = commit.tree
+        def find_by_rugged(repository, sha, path, limit:)
+          return unless path
+
+          rugged_commit = repository.lookup(sha)
+          root_tree = rugged_commit.tree
+
           blob_entry = find_entry_by_path(repository, root_tree.oid, path)
@@ -76,7 +196,8 @@ module Gitlab
                id: blob.oid,
                name: blob_entry[:name],
                size: blob.size,
-               data: blob.content(MAX_DATA_DISPLAY_SIZE),
+               # Rugged::Blob#content is expensive; don't call it if we don't have to.
+               data: limit.zero? ? '' : blob.content(limit),
                mode: blob_entry[:filemode].to_s(8),
                path: path,
                commit_id: sha,
@@ -84,71 +205,42 @@ module Gitlab
              )
            end
          end
+        rescue Rugged::ReferenceError
+          nil
         end
 
-        def raw(repository, sha)
+        def rugged_raw(repository, sha, limit:)
           blob = repository.lookup(sha)
+          return unless blob.is_a?(Rugged::Blob)
+
           new(
             id: blob.oid,
             size: blob.size,
-            data: blob.content(MAX_DATA_DISPLAY_SIZE),
+            data: blob.content(limit),
             binary: blob.binary?
           )
         end
 
-        # Recursive search of blob id by path
-        #
-        # Ex.
-        #   blog/            # oid: 1a
-        #     app/           # oid: 2a
-        #       models/      # oid: 3a
-        #       file.rb      # oid: 4a
-        #
-        #
-        # Blob.find_entry_by_path(repo, '1a', 'app/file.rb') # => '4a'
-        #
-        def find_entry_by_path(repository, root_id, path)
-          root_tree = repository.lookup(root_id)
-          # Strip leading slashes
-          path[/^\/*/] = ''
-          path_arr = path.split('/')
-
-          entry = root_tree.find do |entry|
-            entry[:name] == path_arr[0]
-          end
-
-          return nil unless entry
-
-          if path_arr.size > 1
-            return nil unless entry[:type] == :tree
-            path_arr.shift
-            find_entry_by_path(repository, entry[:oid], path_arr.join('/'))
-          else
-            [:blob, :commit].include?(entry[:type]) ? entry : nil
-          end
-        end
+        # Efficient lookup to determine if object size
+        # and type make it a possible LFS blob without loading
+        # blob content into memory with repository.lookup(sha)
+        def possible_lfs_blob?(repository, sha)
+          object_header = repository.rugged.read_header(sha)
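+          # (Rugged's read_header reads only the object's metadata; as used
+          # below, it yields a hash with :type and :len keys.)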
-        def submodule_blob(blob_entry, path, sha)
-          new(
-            id: blob_entry[:oid],
-            name: blob_entry[:name],
-            size: 0,
-            data: '',
-            path: path,
-            commit_id: sha
-          )
+          object_header[:type] == :blob &&
+            size_could_be_lfs?(object_header[:len])
         end
       end
 
       def initialize(options)
         %w(id name path size data mode commit_id binary).each do |key|
-          self.send("#{key}=", options[key.to_sym])
+          self.__send__("#{key}=", options[key.to_sym]) # rubocop:disable GitlabSecurity/PublicSend
         end
-        @loaded_all_data = false
+
         # Retain the actual size before it is encoded
         @loaded_size = @data.bytesize if @data
+        @loaded_all_data = @loaded_size == size
       end
 
       def binary?
@@ -163,18 +255,35 @@ module Gitlab
       # memory as a Ruby string.
       def load_all_data!(repository)
         return if @data == '' # don't mess with submodule blobs
-        return @data if @loaded_all_data
+
+        # Even if we return early, recalculate whether this blob is binary, in
+        # case it was initialized as text but the full data is not.
+        @binary = nil
+
+        return if @loaded_all_data
+
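+        # Below, limit: -1 follows the "negative limit means the entire blob"
+        # convention documented on .batch above.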
+        @data = Gitlab::GitalyClient.migrate(:git_blob_load_all_data) do |is_enabled|
+          begin
+            if is_enabled
+              repository.gitaly_blob_client.get_blob(oid: id, limit: -1).data
+            else
+              repository.lookup(id).content
+            end
+          end
+        end
+
         @loaded_all_data = true
-        @data = repository.lookup(id).content
         @loaded_size = @data.bytesize
-        @binary = nil
       end
 
       def name
         encode! @name
       end
 
+      def path
+        encode! @path
+      end
+
       def truncated?
         size && (size > loaded_size)
       end
@@ -185,7 +294,7 @@ module Gitlab
       # size
       # see https://github.com/github/git-lfs/blob/v1.1.0/docs/spec.md#the-pointer
       def lfs_pointer?
-        has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
+        self.class.size_could_be_lfs?(size) && has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
       end
 
       def lfs_oid