From 70985aa19b389c2ee8234edfbb516b5403a7bfcf Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Tue, 17 Apr 2018 15:13:38 +0200
Subject: Limit the number of pipelines to count

When displaying the project pipelines dashboard we display a few tabs
for different pipeline states. For every such tab we count the number of
pipelines that belong to it. For large projects such as GitLab CE this
means having to count over 80 000 rows, which can easily take between 70
and 100 milliseconds per query.

To improve this we apply a technique we already use for search results:
we limit the number of rows to count. The current limit is 1000, which
means that if more than 1000 rows are present for a state we will show
"1000+" instead of the exact number. The SQL queries used for this
perform much better than a regular COUNT, even when a project has a lot
of pipelines.

Prior to these changes we would end up running a query like this:

    SELECT COUNT(*)
    FROM ci_pipelines
    WHERE project_id = 13083
    AND status IN ('success', 'failed', 'canceled')

This would produce a plan along the lines of the following:

    Aggregate  (cost=3147.55..3147.56 rows=1 width=8) (actual time=501.413..501.413 rows=1 loops=1)
      Buffers: shared hit=17116 read=861 dirtied=2
      ->  Index Only Scan using index_ci_pipelines_on_project_id_and_ref_and_status_and_id on ci_pipelines  (cost=0.56..2984.14 rows=65364 width=0) (actual time=0.095..490.263 rows=80388 loops=1)
            Index Cond: (project_id = 13083)
            Filter: ((status)::text = ANY ('{success,failed,canceled}'::text[]))
            Rows Removed by Filter: 2894
            Heap Fetches: 353
            Buffers: shared hit=17116 read=861 dirtied=2
    Planning time: 1.409 ms
    Execution time: 501.519 ms

Using the LIMIT count technique we instead run the following query:

    SELECT COUNT(*)
    FROM (
        SELECT 1
        FROM ci_pipelines
        WHERE project_id = 13083
        AND status IN ('success', 'failed', 'canceled')
        LIMIT 1001
    ) for_count

This query produces the following plan:

    Aggregate  (cost=58.77..58.78 rows=1 width=8) (actual time=1.726..1.727 rows=1 loops=1)
      Buffers: shared hit=169 read=15
      ->  Limit  (cost=0.56..46.25 rows=1001 width=4) (actual time=0.164..1.570 rows=1001 loops=1)
            Buffers: shared hit=169 read=15
            ->  Index Only Scan using index_ci_pipelines_on_project_id_and_ref_and_status_and_id on ci_pipelines  (cost=0.56..2984.14 rows=65364 width=4) (actual time=0.162..1.426 rows=1001 loops=1)
                  Index Cond: (project_id = 13083)
                  Filter: ((status)::text = ANY ('{success,failed,canceled}'::text[]))
                  Rows Removed by Filter: 9
                  Heap Fetches: 10
                  Buffers: shared hit=169 read=15
    Planning time: 1.832 ms
    Execution time: 1.821 ms

While this query still uses a Filter for the "status" field the number
of rows that it may end up filtering (at most 1001) is small enough that
an additional index does not appear to be necessary at this time.

See https://gitlab.com/gitlab-org/gitlab-ce/issues/43132#note_68659234
for more information.
---
 app/controllers/projects/pipelines_controller.rb    | 21 ++++++++++-----------
 .../projects/pipelines_controller_spec.rb           |  8 ++++----
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb
index f7417a6a5aa..329aaafb76d 100644
--- a/app/controllers/projects/pipelines_controller.rb
+++ b/app/controllers/projects/pipelines_controller.rb
@@ -18,17 +18,10 @@ class Projects::PipelinesController < Projects::ApplicationController
       .page(params[:page])
       .per(30)
 
-    @running_count = PipelinesFinder
-      .new(project, scope: 'running').execute.count
-
-    @pending_count = PipelinesFinder
-      .new(project, scope: 'pending').execute.count
-
-    @finished_count = PipelinesFinder
-      .new(project, scope: 'finished').execute.count
-
-    @pipelines_count = PipelinesFinder
-      .new(project).execute.count
+    @running_count = limited_pipelines_count(project, 'running')
+    @pending_count = limited_pipelines_count(project, 'pending')
+    @finished_count = limited_pipelines_count(project, 'finished')
+    @pipelines_count = limited_pipelines_count(project)
 
     @pipelines.map(&:commit) # List commits for batch loading
 
@@ -185,4 +178,10 @@ class Projects::PipelinesController < Projects::ApplicationController
   def authorize_update_pipeline!
     return access_denied! unless can?(current_user, :update_pipeline, @pipeline)
   end
+
+  def limited_pipelines_count(project, scope = nil)
+    finder = PipelinesFinder.new(project, scope: scope)
+
+    view_context.limited_counter_with_delimiter(finder.execute)
+  end
 end
diff --git a/spec/controllers/projects/pipelines_controller_spec.rb b/spec/controllers/projects/pipelines_controller_spec.rb
index a451bbb97b6..c22e455ea2d 100644
--- a/spec/controllers/projects/pipelines_controller_spec.rb
+++ b/spec/controllers/projects/pipelines_controller_spec.rb
@@ -35,10 +35,10 @@ describe Projects::PipelinesController do
 
       expect(json_response).to include('pipelines')
       expect(json_response['pipelines'].count).to eq 4
-      expect(json_response['count']['all']).to eq 4
-      expect(json_response['count']['running']).to eq 1
-      expect(json_response['count']['pending']).to eq 1
-      expect(json_response['count']['finished']).to eq 1
+      expect(json_response['count']['all']).to eq '4'
+      expect(json_response['count']['running']).to eq '1'
+      expect(json_response['count']['pending']).to eq '1'
+      expect(json_response['count']['finished']).to eq '1'
     end
 
     context 'when performing gitaly calls', :request_store do
-- 
cgit v1.2.3


From 19428e800895ba20eacb3357285acef8d69f6d8c Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Mon, 7 May 2018 18:22:07 +0200
Subject: Preload pipeline data for project pipelines

When displaying the pipelines of a project we now preload the following
data:

1. Authors of the commits that belong to these pipelines
2. The number of warnings per pipeline, which is used by
   Ci::Pipeline#has_warnings?

== Commit Authors

Previously this data was queried for every Commit separately, leading to
20 SQL queries being executed in the worst case. With an average of 3 to
5 milliseconds per SQL query this could result in 100 milliseconds being
spent in _just_ getting Commit authors.

To preload this data Commit#author now uses BatchLoader (through
Commit#lazy_author), and a separate module
Gitlab::Ci::Pipeline::Preloader is used to ensure all authors are loaded
before they are used.

== Number of warnings

This changes Ci::Pipeline#has_warnings? so it supports preloading of the
number of warnings per pipeline. This removes the need for executing a
COUNT(*) query for every pipeline just to see if it has any warnings or
not.
---
 app/controllers/projects/pipelines_controller.rb |  2 +-
 app/models/ci/pipeline.rb                        | 13 +++-
 app/models/commit.rb                             | 28 +++++++-
 lib/gitlab/ci/pipeline/preloader.rb              | 28 ++++++++
 spec/lib/gitlab/ci/pipeline/preloader_spec.rb    | 20 ++++++
 spec/models/ci/pipeline_spec.rb                  | 27 ++++++++
 spec/models/commit_spec.rb                       | 84 ++++++++++++++++++++++--
 7 files changed, 195 insertions(+), 7 deletions(-)
 create mode 100644 lib/gitlab/ci/pipeline/preloader.rb
 create mode 100644 spec/lib/gitlab/ci/pipeline/preloader_spec.rb

diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb
index 329aaafb76d..91afe24b707 100644
--- a/app/controllers/projects/pipelines_controller.rb
+++ b/app/controllers/projects/pipelines_controller.rb
@@ -23,7 +23,7 @@ class Projects::PipelinesController < Projects::ApplicationController
     @finished_count = limited_pipelines_count(project, 'finished')
     @pipelines_count = limited_pipelines_count(project)
 
-    @pipelines.map(&:commit) # List commits for batch loading
+    Gitlab::Ci::Pipeline::Preloader.preload(@pipelines)
 
     respond_to do |format|
       format.html
diff --git a/app/models/ci/pipeline.rb b/app/models/ci/pipeline.rb
index 1f49764e7cc..c26f0b6dcdc 100644
--- a/app/models/ci/pipeline.rb
+++ b/app/models/ci/pipeline.rb
@@ -406,7 +406,18 @@ module Ci
     end
 
     def has_warnings?
-      builds.latest.failed_but_allowed.any?
+      number_of_warnings.positive?
+    end
+
+    def number_of_warnings
+      BatchLoader.for(id).batch(default_value: 0) do |pipeline_ids, loader|
+        Build.where(commit_id: pipeline_ids)
+          .latest
+          .failed_but_allowed
+          .group(:commit_id)
+          .count
+          .each { |id, amount| loader.call(id, amount) }
+      end
     end
 
     def set_config_source
diff --git a/app/models/commit.rb b/app/models/commit.rb
index b46f9f34689..56d4c86774e 100644
--- a/app/models/commit.rb
+++ b/app/models/commit.rb
@@ -224,8 +224,34 @@ class Commit
     Gitlab::ClosingIssueExtractor.new(project, current_user).closed_by_message(safe_message)
   end
 
+  def lazy_author
+    BatchLoader.for(author_email.downcase).batch do |emails, loader|
+      # A Hash that maps user Emails to the corresponding User objects. The
+      # Emails at this point are the _primary_ Emails of the Users.
+      users_for_emails = User
+        .by_any_email(emails)
+        .each_with_object({}) { |user, hash| hash[user.email] = user }
+
+      users_for_ids = users_for_emails
+        .values
+        .each_with_object({}) { |user, hash| hash[user.id] = user }
+
+      # Some commits may have used an alternative Email address. In this case we
+      # need to query the "emails" table to map those addresses to User objects.
+      Email
+        .where(email: emails - users_for_emails.keys)
+        .pluck(:email, :user_id)
+        .each { |(email, id)| users_for_emails[email] = users_for_ids[id] }
+
+      users_for_emails.each { |email, user| loader.call(email, user) }
+    end
+  end
+
   def author
-    User.find_by_any_email(author_email.downcase)
+    # We use __sync so that we get the actual objects back (including an actual
+    # nil), instead of a wrapper, as returning a wrapped nil breaks a lot of
+    # code.
+    lazy_author.__sync
   end
   request_cache(:author) { author_email.downcase }
 
diff --git a/lib/gitlab/ci/pipeline/preloader.rb b/lib/gitlab/ci/pipeline/preloader.rb
new file mode 100644
index 00000000000..e7a2e5511cf
--- /dev/null
+++ b/lib/gitlab/ci/pipeline/preloader.rb
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Ci
+    module Pipeline
+      # Class for preloading data associated with pipelines such as commit
+      # authors.
+      module Preloader
+        def self.preload(pipelines)
+          # This ensures that all the pipeline commits are eager loaded before we
+          # start using them.
+          pipelines.each(&:commit)
+
+          pipelines.each do |pipeline|
+            # This preloads the author of every commit. We're using "lazy_author"
+            # here since "author" immediately loads the data on the first call.
+            pipeline.commit.try(:lazy_author)
+
+            # This preloads the number of warnings for every pipeline, ensuring
+            # that Ci::Pipeline#has_warnings? doesn't execute any additional
+            # queries.
+            pipeline.number_of_warnings
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/spec/lib/gitlab/ci/pipeline/preloader_spec.rb b/spec/lib/gitlab/ci/pipeline/preloader_spec.rb
new file mode 100644
index 00000000000..477c7477df0
--- /dev/null
+++ b/spec/lib/gitlab/ci/pipeline/preloader_spec.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Gitlab::Ci::Pipeline::Preloader do
+  describe '.preload' do
+    it 'preloads the author of every pipeline commit' do
+      commit = double(:commit)
+      pipeline = double(:pipeline, commit: commit)
+
+      expect(commit)
+        .to receive(:lazy_author)
+
+      expect(pipeline)
+        .to receive(:number_of_warnings)
+
+      described_class.preload([pipeline])
+    end
+  end
+end
diff --git a/spec/models/ci/pipeline_spec.rb b/spec/models/ci/pipeline_spec.rb
index ddd66a6be87..e7845b693a1 100644
--- a/spec/models/ci/pipeline_spec.rb
+++ b/spec/models/ci/pipeline_spec.rb
@@ -774,6 +774,33 @@ describe Ci::Pipeline, :mailer do
     end
   end
 
+  describe '#number_of_warnings' do
+    it 'returns the number of warnings' do
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop')
+
+      expect(pipeline.number_of_warnings).to eq(1)
+    end
+
+    it 'supports eager loading of the number of warnings' do
+      pipeline2 = create(:ci_empty_pipeline, status: :created, project: project)
+
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop')
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline2, name: 'rubocop')
+
+      pipelines = project.pipelines.to_a
+
+      pipelines.each(&:number_of_warnings)
+
+      # To run the queries we need to actually use the lazy objects, which we do
+      # by just sending "to_i" to them.
+      amount = ActiveRecord::QueryRecorder
+        .new { pipelines.each { |p| p.number_of_warnings.to_i } }
+        .count
+
+      expect(amount).to eq(1)
+    end
+  end
+
   shared_context 'with some outdated pipelines' do
     before do
       create_pipeline(:canceled, 'ref', 'A', project)
diff --git a/spec/models/commit_spec.rb b/spec/models/commit_spec.rb
index 448b813c2e1..090f91168ad 100644
--- a/spec/models/commit_spec.rb
+++ b/spec/models/commit_spec.rb
@@ -52,22 +52,98 @@ describe Commit do
     end
   end
 
-  describe '#author' do
+  describe '#author', :request_store do
     it 'looks up the author in a case-insensitive way' do
       user = create(:user, email: commit.author_email.upcase)
       expect(commit.author).to eq(user)
     end
 
-    it 'caches the author', :request_store do
+    it 'caches the author' do
       user = create(:user, email: commit.author_email)
-      expect(User).to receive(:find_by_any_email).and_call_original
 
       expect(commit.author).to eq(user)
+
       key = "Commit:author:#{commit.author_email.downcase}"
-      expect(RequestStore.store[key]).to eq(user)
 
+      expect(RequestStore.store[key]).to eq(user)
       expect(commit.author).to eq(user)
     end
+
+    context 'using eager loading' do
+      let!(:alice) { create(:user, email: 'alice@example.com') }
+      let!(:bob) { create(:user, email: 'hunter2@example.com') }
+
+      let(:alice_commit) do
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'alice@example.com'
+        end
+      end
+
+      let(:bob_commit) do
+        # The commit for Bob uses one of his alternative Emails, instead of the
+        # primary one.
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'bob@example.com'
+        end
+      end
+
+      let(:eve_commit) do
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'eve@example.com'
+        end
+      end
+
+      let!(:commits) { [alice_commit, bob_commit, eve_commit] }
+
+      before do
+        create(:email, user: bob, email: 'bob@example.com')
+      end
+
+      it 'executes only two SQL queries' do
+        recorder = ActiveRecord::QueryRecorder.new do
+          # Running this first ensures we don't run one query for every
+          # commit.
+          commits.each(&:lazy_author)
+
+          # This forces the execution of the SQL queries necessary to load the
+          # data.
+          commits.each { |c| c.author.try(:id) }
+        end
+
+        expect(recorder.count).to eq(2)
+      end
+
+      it "preloads the authors for Commits matching a user's primary Email" do
+        commits.each(&:lazy_author)
+
+        expect(alice_commit.author).to eq(alice)
+      end
+
+      it "preloads the authors for Commits using a User's alternative Email" do
+        commits.each(&:lazy_author)
+
+        expect(bob_commit.author).to eq(bob)
+      end
+
+      it 'sets the author to Nil if an author could not be found for a Commit' do
+        commits.each(&:lazy_author)
+
+        expect(eve_commit.author).to be_nil
+      end
+
+      it 'does not execute SQL queries once the authors are preloaded' do
+        commits.each(&:lazy_author)
+        commits.each { |c| c.author.try(:id) }
+
+        recorder = ActiveRecord::QueryRecorder.new do
+          alice_commit.author
+          bob_commit.author
+          eve_commit.author
+        end
+
+        expect(recorder.count).to be_zero
+      end
+    end
   end
 
   describe '#to_reference' do
-- 
cgit v1.2.3


From 878ca2e69b371e6c12acee6bddd32b4406c651d7 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Fri, 11 May 2018 16:17:03 +0200
Subject: Exclude coverage data from the pipelines page

When displaying a project's pipelines
(Projects::PipelinesController#index) we now exclude the coverage data.
This data was not used by the frontend, yet getting it would require one
SQL query per pipeline. These queries in turn could be quite expensive
on GitLab.com.
---
 app/controllers/projects/pipelines_controller.rb       | 2 +-
 app/serializers/pipeline_entity.rb                     | 6 +++++-
 spec/controllers/projects/pipelines_controller_spec.rb | 6 ++++++
 spec/serializers/pipeline_entity_spec.rb               | 7 +++++++
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb
index 91afe24b707..6b40fc2fe68 100644
--- a/app/controllers/projects/pipelines_controller.rb
+++ b/app/controllers/projects/pipelines_controller.rb
@@ -34,7 +34,7 @@ class Projects::PipelinesController < Projects::ApplicationController
           pipelines: PipelineSerializer
             .new(project: @project, current_user: @current_user)
             .with_pagination(request, response)
-            .represent(@pipelines),
+            .represent(@pipelines, disable_coverage: true),
           count: {
             all: @pipelines_count,
             running: @running_count,
diff --git a/app/serializers/pipeline_entity.rb b/app/serializers/pipeline_entity.rb
index 6457294b285..f782b411b84 100644
--- a/app/serializers/pipeline_entity.rb
+++ b/app/serializers/pipeline_entity.rb
@@ -4,7 +4,11 @@ class PipelineEntity < Grape::Entity
   expose :id
   expose :user, using: UserEntity
   expose :active?, as: :active
-  expose :coverage
+
+  # Coverage isn't always necessary (e.g. when displaying project pipelines in
+  # the UI). Instead of creating an entirely different entity we just allow the
+  # disabling of this specific field whenever necessary.
+  expose :coverage, unless: proc { options[:disable_coverage] }
   expose :source
 
   expose :created_at, :updated_at
diff --git a/spec/controllers/projects/pipelines_controller_spec.rb b/spec/controllers/projects/pipelines_controller_spec.rb
index c22e455ea2d..9e7bc20a6d1 100644
--- a/spec/controllers/projects/pipelines_controller_spec.rb
+++ b/spec/controllers/projects/pipelines_controller_spec.rb
@@ -41,6 +41,12 @@ describe Projects::PipelinesController do
       expect(json_response['count']['finished']).to eq '1'
     end
 
+    it 'does not include coverage data for the pipelines' do
+      subject
+
+      expect(json_response['pipelines'][0]).not_to include('coverage')
+    end
+
     context 'when performing gitaly calls', :request_store do
       it 'limits the Gitaly requests' do
         expect { subject }.to change { Gitlab::GitalyClient.get_request_count }.by(3)
diff --git a/spec/serializers/pipeline_entity_spec.rb b/spec/serializers/pipeline_entity_spec.rb
index 2473c561f4b..e67d12b7a89 100644
--- a/spec/serializers/pipeline_entity_spec.rb
+++ b/spec/serializers/pipeline_entity_spec.rb
@@ -26,6 +26,13 @@ describe PipelineEntity do
         expect(subject).to include :updated_at, :created_at
       end
 
+      it 'excludes coverage data when disabled' do
+        entity = described_class
+          .represent(pipeline, request: request, disable_coverage: true)
+
+        expect(entity.as_json).not_to include(:coverage)
+      end
+
       it 'contains details' do
         expect(subject).to include :details
         expect(subject[:details])
-- 
cgit v1.2.3


From da7bbef8fed767483236f9503553942b4c1acd05 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Fri, 11 May 2018 16:22:09 +0200
Subject: Added changelog for pipelines page performance

---
 changelogs/unreleased/pipelines-index-performance.yml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 changelogs/unreleased/pipelines-index-performance.yml

diff --git a/changelogs/unreleased/pipelines-index-performance.yml b/changelogs/unreleased/pipelines-index-performance.yml
new file mode 100644
index 00000000000..928c2ddab72
--- /dev/null
+++ b/changelogs/unreleased/pipelines-index-performance.yml
@@ -0,0 +1,5 @@
+---
+title: Improve performance of project pipelines pages
+merge_request:
+author:
+type: performance
-- 
cgit v1.2.3