From 70985aa19b389c2ee8234edfbb516b5403a7bfcf Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 17 Apr 2018 15:13:38 +0200 Subject: Limit the number of pipelines to count When displaying the project pipelines dashboard we display a few tabs for different pipeline states. For every such tab we count the number of pipelines that belong to it. For large projects such as GitLab CE this means having to count over 80 000 rows, which can easily take between 70 and 100 milliseconds per query. To improve this we apply a technique we already use for search results: we limit the number of rows to count. The current limit is 1000, which means that if more than 1000 rows are present for a state we will show "1000+" instead of the exact number. The SQL queries used for this perform much better than a regular COUNT, even when a project has a lot of pipelines. Prior to these changes we would end up running a query like this: SELECT COUNT(*) FROM ci_pipelines WHERE project_id = 13083 AND status IN ('success', 'failed', 'canceled') This would produce a plan along the lines of the following: Aggregate (cost=3147.55..3147.56 rows=1 width=8) (actual time=501.413..501.413 rows=1 loops=1) Buffers: shared hit=17116 read=861 dirtied=2 -> Index Only Scan using index_ci_pipelines_on_project_id_and_ref_and_status_and_id on ci_pipelines (cost=0.56..2984.14 rows=65364 width=0) (actual time=0.095..490.263 rows=80388 loops=1) Index Cond: (project_id = 13083) Filter: ((status)::text = ANY ('{success,failed,canceled}'::text[])) Rows Removed by Filter: 2894 Heap Fetches: 353 Buffers: shared hit=17116 read=861 dirtied=2 Planning time: 1.409 ms Execution time: 501.519 ms Using the LIMIT count technique we instead run the following query: SELECT COUNT(*) FROM ( SELECT 1 FROM ci_pipelines WHERE project_id = 13083 AND status IN ('success', 'failed', 'canceled') LIMIT 1001 ) for_count This query produces the following plan: Aggregate (cost=58.77..58.78 rows=1 width=8) (actual time=1.726..1.727 rows=1 loops=1) Buffers: shared hit=169 read=15 -> Limit (cost=0.56..46.25 rows=1001 width=4) (actual time=0.164..1.570 rows=1001 loops=1) Buffers: shared hit=169 read=15 -> Index Only Scan using index_ci_pipelines_on_project_id_and_ref_and_status_and_id on ci_pipelines (cost=0.56..2984.14 rows=65364 width=4) (actual time=0.162..1.426 rows=1001 loops=1) Index Cond: (project_id = 13083) Filter: ((status)::text = ANY ('{success,failed,canceled}'::text[])) Rows Removed by Filter: 9 Heap Fetches: 10 Buffers: shared hit=169 read=15 Planning time: 1.832 ms Execution time: 1.821 ms While this query still uses a Filter for the "status" field the number of rows that it may end up filtering (at most 1001) is small enough that an additional index does not appear to be necessary at this time. See https://gitlab.com/gitlab-org/gitlab-ce/issues/43132#note_68659234 for more information. --- app/controllers/projects/pipelines_controller.rb | 21 ++++++++++----------- .../projects/pipelines_controller_spec.rb | 8 ++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb index f7417a6a5aa..329aaafb76d 100644 --- a/app/controllers/projects/pipelines_controller.rb +++ b/app/controllers/projects/pipelines_controller.rb @@ -18,17 +18,10 @@ class Projects::PipelinesController < Projects::ApplicationController .page(params[:page]) .per(30) - @running_count = PipelinesFinder - .new(project, scope: 'running').execute.count - - @pending_count = PipelinesFinder - .new(project, scope: 'pending').execute.count - - @finished_count = PipelinesFinder - .new(project, scope: 'finished').execute.count - - @pipelines_count = PipelinesFinder - .new(project).execute.count + @running_count = limited_pipelines_count(project, 'running') + @pending_count = limited_pipelines_count(project, 'pending') + @finished_count = limited_pipelines_count(project, 'finished') + @pipelines_count = limited_pipelines_count(project) @pipelines.map(&:commit) # List commits for batch loading @@ -185,4 +178,10 @@ class Projects::PipelinesController < Projects::ApplicationController def authorize_update_pipeline! return access_denied! unless can?(current_user, :update_pipeline, @pipeline) end + + def limited_pipelines_count(project, scope = nil) + finder = PipelinesFinder.new(project, scope: scope) + + view_context.limited_counter_with_delimiter(finder.execute) + end end diff --git a/spec/controllers/projects/pipelines_controller_spec.rb b/spec/controllers/projects/pipelines_controller_spec.rb index a451bbb97b6..c22e455ea2d 100644 --- a/spec/controllers/projects/pipelines_controller_spec.rb +++ b/spec/controllers/projects/pipelines_controller_spec.rb @@ -35,10 +35,10 @@ describe Projects::PipelinesController do expect(json_response).to include('pipelines') expect(json_response['pipelines'].count).to eq 4 - expect(json_response['count']['all']).to eq 4 - expect(json_response['count']['running']).to eq 1 - expect(json_response['count']['pending']).to eq 1 - expect(json_response['count']['finished']).to eq 1 + expect(json_response['count']['all']).to eq '4' + expect(json_response['count']['running']).to eq '1' + expect(json_response['count']['pending']).to eq '1' + expect(json_response['count']['finished']).to eq '1' end context 'when performing gitaly calls', :request_store do -- cgit v1.2.3 From 19428e800895ba20eacb3357285acef8d69f6d8c Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 7 May 2018 18:22:07 +0200 Subject: Preload pipeline data for project pipelines When displaying the pipelines of a project we now preload the following data: 1. Authors of the commits that belong to these pipelines 2. The number of warnings per pipeline, which is used by Ci::Pipeline#has_warnings? == Commit Authors Previously this data was queried for every Commit separately, leading to 20 SQL queries being executed in the worst case. With an average of 3 to 5 milliseconds per SQL query this could result in 100 milliseconds being spent in _just_ getting Commit authors. To preload this data Commit#author now uses BatchLoader (through Commit#lazy_author), and a separate module Gitlab::Ci::Pipeline::Preloader is used to ensure all authors are loaded before they are used. == Number of warnings This changes Ci::Pipeline#has_warnings? so it supports preloading of the number of warnings per pipeline. This removes the need for executing a COUNT(*) query for every pipeline just to see if it has any warnings or not. --- app/controllers/projects/pipelines_controller.rb | 2 +- app/models/ci/pipeline.rb | 13 +++- app/models/commit.rb | 28 +++++++- lib/gitlab/ci/pipeline/preloader.rb | 28 ++++++++ spec/lib/gitlab/ci/pipeline/preloader_spec.rb | 20 ++++++ spec/models/ci/pipeline_spec.rb | 27 ++++++++ spec/models/commit_spec.rb | 84 ++++++++++++++++++++++-- 7 files changed, 195 insertions(+), 7 deletions(-) create mode 100644 lib/gitlab/ci/pipeline/preloader.rb create mode 100644 spec/lib/gitlab/ci/pipeline/preloader_spec.rb diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb index 329aaafb76d..91afe24b707 100644 --- a/app/controllers/projects/pipelines_controller.rb +++ b/app/controllers/projects/pipelines_controller.rb @@ -23,7 +23,7 @@ class Projects::PipelinesController < Projects::ApplicationController @finished_count = limited_pipelines_count(project, 'finished') @pipelines_count = limited_pipelines_count(project) - @pipelines.map(&:commit) # List commits for batch loading + Gitlab::Ci::Pipeline::Preloader.preload(@pipelines) respond_to do |format| format.html diff --git a/app/models/ci/pipeline.rb b/app/models/ci/pipeline.rb index 1f49764e7cc..c26f0b6dcdc 100644 --- a/app/models/ci/pipeline.rb +++ b/app/models/ci/pipeline.rb @@ -406,7 +406,18 @@ module Ci end def has_warnings? - builds.latest.failed_but_allowed.any? + number_of_warnings.positive? + end + + def number_of_warnings + BatchLoader.for(id).batch(default_value: 0) do |pipeline_ids, loader| + Build.where(commit_id: pipeline_ids) + .latest + .failed_but_allowed + .group(:commit_id) + .count + .each { |id, amount| loader.call(id, amount) } + end end def set_config_source diff --git a/app/models/commit.rb b/app/models/commit.rb index b46f9f34689..56d4c86774e 100644 --- a/app/models/commit.rb +++ b/app/models/commit.rb @@ -224,8 +224,34 @@ class Commit Gitlab::ClosingIssueExtractor.new(project, current_user).closed_by_message(safe_message) end + def lazy_author + BatchLoader.for(author_email.downcase).batch do |emails, loader| + # A Hash that maps user Emails to the corresponding User objects. The + # Emails at this point are the _primary_ Emails of the Users. + users_for_emails = User + .by_any_email(emails) + .each_with_object({}) { |user, hash| hash[user.email] = user } + + users_for_ids = users_for_emails + .values + .each_with_object({}) { |user, hash| hash[user.id] = user } + + # Some commits may have used an alternative Email address. In this case we + # need to query the "emails" table to map those addresses to User objects. + Email + .where(email: emails - users_for_emails.keys) + .pluck(:email, :user_id) + .each { |(email, id)| users_for_emails[email] = users_for_ids[id] } + + users_for_emails.each { |email, user| loader.call(email, user) } + end + end + def author - User.find_by_any_email(author_email.downcase) + # We use __sync so that we get the actual objects back (including an actual + # nil), instead of a wrapper, as returning a wrapped nil breaks a lot of + # code. + lazy_author.__sync end request_cache(:author) { author_email.downcase } diff --git a/lib/gitlab/ci/pipeline/preloader.rb b/lib/gitlab/ci/pipeline/preloader.rb new file mode 100644 index 00000000000..e7a2e5511cf --- /dev/null +++ b/lib/gitlab/ci/pipeline/preloader.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +module Gitlab + module Ci + module Pipeline + # Class for preloading data associated with pipelines such as commit + # authors. + module Preloader + def self.preload(pipelines) + # This ensures that all the pipeline commits are eager loaded before we + # start using them. + pipelines.each(&:commit) + + pipelines.each do |pipeline| + # This preloads the author of every commit. We're using "lazy_author" + # here since "author" immediately loads the data on the first call. + pipeline.commit.try(:lazy_author) + + # This preloads the number of warnings for every pipeline, ensuring + # that Ci::Pipeline#has_warnings? doesn't execute any additional + # queries. + pipeline.number_of_warnings + end + end + end + end + end +end diff --git a/spec/lib/gitlab/ci/pipeline/preloader_spec.rb b/spec/lib/gitlab/ci/pipeline/preloader_spec.rb new file mode 100644 index 00000000000..477c7477df0 --- /dev/null +++ b/spec/lib/gitlab/ci/pipeline/preloader_spec.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Gitlab::Ci::Pipeline::Preloader do + describe '.preload' do + it 'preloads the author of every pipeline commit' do + commit = double(:commit) + pipeline = double(:pipeline, commit: commit) + + expect(commit) + .to receive(:lazy_author) + + expect(pipeline) + .to receive(:number_of_warnings) + + described_class.preload([pipeline]) + end + end +end diff --git a/spec/models/ci/pipeline_spec.rb b/spec/models/ci/pipeline_spec.rb index ddd66a6be87..e7845b693a1 100644 --- a/spec/models/ci/pipeline_spec.rb +++ b/spec/models/ci/pipeline_spec.rb @@ -774,6 +774,33 @@ describe Ci::Pipeline, :mailer do end end + describe '#number_of_warnings' do + it 'returns the number of warnings' do + create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop') + + expect(pipeline.number_of_warnings).to eq(1) + end + + it 'supports eager loading of the number of warnings' do + pipeline2 = create(:ci_empty_pipeline, status: :created, project: project) + + create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop') + create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline2, name: 'rubocop') + + pipelines = project.pipelines.to_a + + pipelines.each(&:number_of_warnings) + + # To run the queries we need to actually use the lazy objects, which we do + # by just sending "to_i" to them. + amount = ActiveRecord::QueryRecorder + .new { pipelines.each { |p| p.number_of_warnings.to_i } } + .count + + expect(amount).to eq(1) + end + end + shared_context 'with some outdated pipelines' do before do create_pipeline(:canceled, 'ref', 'A', project) diff --git a/spec/models/commit_spec.rb b/spec/models/commit_spec.rb index 448b813c2e1..090f91168ad 100644 --- a/spec/models/commit_spec.rb +++ b/spec/models/commit_spec.rb @@ -52,22 +52,98 @@ describe Commit do end end - describe '#author' do + describe '#author', :request_store do it 'looks up the author in a case-insensitive way' do user = create(:user, email: commit.author_email.upcase) expect(commit.author).to eq(user) end - it 'caches the author', :request_store do + it 'caches the author' do user = create(:user, email: commit.author_email) - expect(User).to receive(:find_by_any_email).and_call_original expect(commit.author).to eq(user) + key = "Commit:author:#{commit.author_email.downcase}" - expect(RequestStore.store[key]).to eq(user) + expect(RequestStore.store[key]).to eq(user) expect(commit.author).to eq(user) end + + context 'using eager loading' do + let!(:alice) { create(:user, email: 'alice@example.com') } + let!(:bob) { create(:user, email: 'hunter2@example.com') } + + let(:alice_commit) do + described_class.new(RepoHelpers.sample_commit, project).tap do |c| + c.author_email = 'alice@example.com' + end + end + + let(:bob_commit) do + # The commit for Bob uses one of his alternative Emails, instead of the + # primary one. + described_class.new(RepoHelpers.sample_commit, project).tap do |c| + c.author_email = 'bob@example.com' + end + end + + let(:eve_commit) do + described_class.new(RepoHelpers.sample_commit, project).tap do |c| + c.author_email = 'eve@example.com' + end + end + + let!(:commits) { [alice_commit, bob_commit, eve_commit] } + + before do + create(:email, user: bob, email: 'bob@example.com') + end + + it 'executes only two SQL queries' do + recorder = ActiveRecord::QueryRecorder.new do + # Running this first ensures we don't run one query for every + # commit. + commits.each(&:lazy_author) + + # This forces the execution of the SQL queries necessary to load the + # data. + commits.each { |c| c.author.try(:id) } + end + + expect(recorder.count).to eq(2) + end + + it "preloads the authors for Commits matching a user's primary Email" do + commits.each(&:lazy_author) + + expect(alice_commit.author).to eq(alice) + end + + it "preloads the authors for Commits using a User's alternative Email" do + commits.each(&:lazy_author) + + expect(bob_commit.author).to eq(bob) + end + + it 'sets the author to Nil if an author could not be found for a Commit' do + commits.each(&:lazy_author) + + expect(eve_commit.author).to be_nil + end + + it 'does not execute SQL queries once the authors are preloaded' do + commits.each(&:lazy_author) + commits.each { |c| c.author.try(:id) } + + recorder = ActiveRecord::QueryRecorder.new do + alice_commit.author + bob_commit.author + eve_commit.author + end + + expect(recorder.count).to be_zero + end + end end describe '#to_reference' do -- cgit v1.2.3 From 878ca2e69b371e6c12acee6bddd32b4406c651d7 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 11 May 2018 16:17:03 +0200 Subject: Exclude coverage data from the pipelines page When displaying a project's pipelines (Projects::PipelinesController#index) we now exclude the coverage data. This data was not used by the frontend, yet getting it would require one SQL query per pipeline. These queries in turn could be quite expensive on GitLab.com. --- app/controllers/projects/pipelines_controller.rb | 2 +- app/serializers/pipeline_entity.rb | 6 +++++- spec/controllers/projects/pipelines_controller_spec.rb | 6 ++++++ spec/serializers/pipeline_entity_spec.rb | 7 +++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/app/controllers/projects/pipelines_controller.rb b/app/controllers/projects/pipelines_controller.rb index 91afe24b707..6b40fc2fe68 100644 --- a/app/controllers/projects/pipelines_controller.rb +++ b/app/controllers/projects/pipelines_controller.rb @@ -34,7 +34,7 @@ class Projects::PipelinesController < Projects::ApplicationController pipelines: PipelineSerializer .new(project: @project, current_user: @current_user) .with_pagination(request, response) - .represent(@pipelines), + .represent(@pipelines, disable_coverage: true), count: { all: @pipelines_count, running: @running_count, diff --git a/app/serializers/pipeline_entity.rb b/app/serializers/pipeline_entity.rb index 6457294b285..f782b411b84 100644 --- a/app/serializers/pipeline_entity.rb +++ b/app/serializers/pipeline_entity.rb @@ -4,7 +4,11 @@ class PipelineEntity < Grape::Entity expose :id expose :user, using: UserEntity expose :active?, as: :active - expose :coverage + + # Coverage isn't always necessary (e.g. when displaying project pipelines in + # the UI). Instead of creating an entirely different entity we just allow the + # disabling of this specific field whenever necessary. + expose :coverage, unless: proc { options[:disable_coverage] } expose :source expose :created_at, :updated_at diff --git a/spec/controllers/projects/pipelines_controller_spec.rb b/spec/controllers/projects/pipelines_controller_spec.rb index c22e455ea2d..9e7bc20a6d1 100644 --- a/spec/controllers/projects/pipelines_controller_spec.rb +++ b/spec/controllers/projects/pipelines_controller_spec.rb @@ -41,6 +41,12 @@ describe Projects::PipelinesController do expect(json_response['count']['finished']).to eq '1' end + it 'does not include coverage data for the pipelines' do + subject + + expect(json_response['pipelines'][0]).not_to include('coverage') + end + context 'when performing gitaly calls', :request_store do it 'limits the Gitaly requests' do expect { subject }.to change { Gitlab::GitalyClient.get_request_count }.by(3) diff --git a/spec/serializers/pipeline_entity_spec.rb b/spec/serializers/pipeline_entity_spec.rb index 2473c561f4b..e67d12b7a89 100644 --- a/spec/serializers/pipeline_entity_spec.rb +++ b/spec/serializers/pipeline_entity_spec.rb @@ -26,6 +26,13 @@ describe PipelineEntity do expect(subject).to include :updated_at, :created_at end + it 'excludes coverage data when disabled' do + entity = described_class + .represent(pipeline, request: request, disable_coverage: true) + + expect(entity.as_json).not_to include(:coverage) + end + it 'contains details' do expect(subject).to include :details expect(subject[:details]) -- cgit v1.2.3 From da7bbef8fed767483236f9503553942b4c1acd05 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 11 May 2018 16:22:09 +0200 Subject: Added changelog for pipelines page performance --- changelogs/unreleased/pipelines-index-performance.yml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelogs/unreleased/pipelines-index-performance.yml diff --git a/changelogs/unreleased/pipelines-index-performance.yml b/changelogs/unreleased/pipelines-index-performance.yml new file mode 100644 index 00000000000..928c2ddab72 --- /dev/null +++ b/changelogs/unreleased/pipelines-index-performance.yml @@ -0,0 +1,5 @@ +--- +title: Improve performance of project pipelines pages +merge_request: +author: +type: performance -- cgit v1.2.3