From 97c2564ffac057f1830d008269f90afa9e12f815 Mon Sep 17 00:00:00 2001 From: Douwe Maan Date: Fri, 16 Aug 2019 13:26:31 -0500 Subject: Look up upstream commits once before queuing ProcessCommitWorkers Instead of checking if a commit already exists in the upstream project in its ProcessCommitWorker and bailing out if it does, we check the existence of all commits in bulk in Git::BranchHooksService, so that we can skip scheduling ProcessCommitWorker jobs for those commits that already exist upstream entirely. --- app/services/git/branch_hooks_service.rb | 25 +++++++- app/workers/process_commit_worker.rb | 17 +----- .../unreleased/dm-process-commit-worker-n-1.yml | 5 ++ spec/services/git/branch_hooks_service_spec.rb | 70 ++++++++++++++++++---- spec/workers/process_commit_worker_spec.rb | 40 ------------- 5 files changed, 90 insertions(+), 67 deletions(-) create mode 100644 changelogs/unreleased/dm-process-commit-worker-n-1.yml diff --git a/app/services/git/branch_hooks_service.rb b/app/services/git/branch_hooks_service.rb index 431a5aedf2e..d2b037a680c 100644 --- a/app/services/git/branch_hooks_service.rb +++ b/app/services/git/branch_hooks_service.rb @@ -83,8 +83,16 @@ module Git # Schedules processing of commit messages def enqueue_process_commit_messages - limited_commits.each do |commit| - next unless commit.matches_cross_reference_regex? + referencing_commits = limited_commits.select(&:matches_cross_reference_regex?) + + upstream_commit_ids = upstream_commit_ids(referencing_commits) + + referencing_commits.each do |commit| + # Avoid reprocessing commits that already exist upstream if the project + # is a fork. This will prevent duplicated/superfluous system notes on + # mentionables referenced by a commit that is pushed to the upstream, + # that is then also pushed to forks when these get synced by users. + next if upstream_commit_ids.include?(commit.id) ProcessCommitWorker.perform_async( project.id, @@ -142,5 +150,18 @@ module Git def branch_name strong_memoize(:branch_name) { Gitlab::Git.ref_name(params[:ref]) } end + + def upstream_commit_ids(commits) + set = Set.new + + upstream_project = project.fork_source + if upstream_project + upstream_project + .commits_by(oids: commits.map(&:id)) + .each { |commit| set << commit.id } + end + + set + end end end diff --git a/app/workers/process_commit_worker.rb b/app/workers/process_commit_worker.rb index 3efb5343a96..f6ebe4ab006 100644 --- a/app/workers/process_commit_worker.rb +++ b/app/workers/process_commit_worker.rb @@ -2,7 +2,8 @@ # Worker for processing individual commit messages pushed to a repository. # -# Jobs for this worker are scheduled for every commit that is being pushed. As a +# Jobs for this worker are scheduled for every commit that contains mentionable +# references in its message and does not exist in the upstream project. As a # result of this the workload of this worker should be kept to a bare minimum. # Consider using an extra worker if you need to add any extra (and potentially # slow) processing of commits. @@ -19,7 +20,6 @@ class ProcessCommitWorker project = Project.find_by(id: project_id) return unless project - return if commit_exists_in_upstream?(project, commit_hash) user = User.find_by(id: user_id) @@ -77,17 +77,4 @@ class ProcessCommitWorker Commit.from_hash(hash, project) end - - private - - # Avoid reprocessing commits that already exist in the upstream - # when project is forked. This will also prevent duplicated system notes. - def commit_exists_in_upstream?(project, commit_hash) - upstream_project = project.fork_source - - return false unless upstream_project - - commit_id = commit_hash.with_indifferent_access[:id] - upstream_project.commit(commit_id).present? - end end diff --git a/changelogs/unreleased/dm-process-commit-worker-n-1.yml b/changelogs/unreleased/dm-process-commit-worker-n-1.yml new file mode 100644 index 00000000000..0bd7de6730a --- /dev/null +++ b/changelogs/unreleased/dm-process-commit-worker-n-1.yml @@ -0,0 +1,5 @@ +--- +title: Look up upstream commits once before queuing ProcessCommitWorkers +merge_request: +author: +type: performance diff --git a/spec/services/git/branch_hooks_service_spec.rb b/spec/services/git/branch_hooks_service_spec.rb index 23be400059e..41180402759 100644 --- a/spec/services/git/branch_hooks_service_spec.rb +++ b/spec/services/git/branch_hooks_service_spec.rb @@ -4,6 +4,7 @@ require 'spec_helper' describe Git::BranchHooksService do include RepoHelpers + include ProjectForksHelper let(:project) { create(:project, :repository) } let(:user) { project.creator } @@ -266,10 +267,10 @@ describe Git::BranchHooksService do end describe 'Processing commit messages' do - # Create 4 commits, 2 of which have references. Limiting to 2 commits, we - # expect to see one commit message processor enqueued. - let(:commit_ids) do - Array.new(4) do |i| + # Create 6 commits, 3 of which have references. Limiting to 4 commits, we + # expect to see two commit message processors enqueued. + let!(:commit_ids) do + Array.new(6) do |i| message = "Issue #{'#' if i.even?}#{i}" project.repository.update_file( user, 'README.md', '', message: message, branch_name: branch @@ -277,18 +278,18 @@ describe Git::BranchHooksService do end end - let(:oldrev) { commit_ids.first } + let(:oldrev) { project.commit(commit_ids.first).parent_id } let(:newrev) { commit_ids.last } before do - stub_const("::Git::BaseHooksService::PROCESS_COMMIT_LIMIT", 2) + stub_const("::Git::BaseHooksService::PROCESS_COMMIT_LIMIT", 4) end context 'creating the default branch' do let(:oldrev) { Gitlab::Git::BLANK_SHA } it 'processes a limited number of commit messages' do - expect(ProcessCommitWorker).to receive(:perform_async).once + expect(ProcessCommitWorker).to receive(:perform_async).twice service.execute end @@ -296,7 +297,7 @@ describe Git::BranchHooksService do context 'updating the default branch' do it 'processes a limited number of commit messages' do - expect(ProcessCommitWorker).to receive(:perform_async).once + expect(ProcessCommitWorker).to receive(:perform_async).twice service.execute end @@ -317,7 +318,7 @@ describe Git::BranchHooksService do let(:oldrev) { Gitlab::Git::BLANK_SHA } it 'processes a limited number of commit messages' do - expect(ProcessCommitWorker).to receive(:perform_async).once + expect(ProcessCommitWorker).to receive(:perform_async).twice service.execute end @@ -327,7 +328,7 @@ describe Git::BranchHooksService do let(:branch) { 'fix' } it 'processes a limited number of commit messages' do - expect(ProcessCommitWorker).to receive(:perform_async).once + expect(ProcessCommitWorker).to receive(:perform_async).twice service.execute end @@ -343,6 +344,55 @@ describe Git::BranchHooksService do service.execute end end + + context 'when the project is forked' do + let(:upstream_project) { project } + let(:forked_project) { fork_project(upstream_project, user, repository: true) } + + let!(:forked_service) do + described_class.new(forked_project, user, oldrev: oldrev, newrev: newrev, ref: ref) + end + + context 'when commits already exists in the upstream project' do + it 'does not process commit messages' do + expect(ProcessCommitWorker).not_to receive(:perform_async) + + forked_service.execute + end + end + + context 'when a commit does not exist in the upstream repo' do + # On top of the existing 6 commits, 3 of which have references, + # create 2 more, 1 of which has a reference. Limiting to 4 commits, we + # expect to see one commit message processor enqueued. + let!(:forked_commit_ids) do + Array.new(2) do |i| + message = "Issue #{'#' if i.even?}#{i}" + forked_project.repository.update_file( + user, 'README.md', '', message: message, branch_name: branch + ) + end + end + + let(:newrev) { forked_commit_ids.last } + + it 'processes the commit message' do + expect(ProcessCommitWorker).to receive(:perform_async).once + + forked_service.execute + end + end + + context 'when the upstream project no longer exists' do + it 'processes the commit messages' do + upstream_project.destroy! + + expect(ProcessCommitWorker).to receive(:perform_async).twice + + forked_service.execute + end + end + end end describe 'New branch detection' do diff --git a/spec/workers/process_commit_worker_spec.rb b/spec/workers/process_commit_worker_spec.rb index 47bac63511e..eb1d3c364ac 100644 --- a/spec/workers/process_commit_worker_spec.rb +++ b/spec/workers/process_commit_worker_spec.rb @@ -3,8 +3,6 @@ require 'spec_helper' describe ProcessCommitWorker do - include ProjectForksHelper - let(:worker) { described_class.new } let(:user) { create(:user) } let(:project) { create(:project, :public, :repository) } @@ -35,44 +33,6 @@ describe ProcessCommitWorker do worker.perform(project.id, user.id, commit.to_hash) end - - context 'when the project is forked' do - context 'when commit already exists in the upstream project' do - it 'does not process the commit message' do - forked = fork_project(project, user, repository: true) - - expect(worker).not_to receive(:process_commit_message) - - worker.perform(forked.id, user.id, forked.commit.to_hash) - end - end - - context 'when the commit does not exist in the upstream project' do - it 'processes the commit message' do - empty_project = create(:project, :public) - forked = fork_project(empty_project, user, repository: true) - - TestEnv.copy_repo(forked, - bare_repo: TestEnv.factory_repo_path_bare, - refs: TestEnv::BRANCH_SHA) - - expect(worker).to receive(:process_commit_message) - - worker.perform(forked.id, user.id, forked.commit.to_hash) - end - end - - context 'when the upstream project no longer exists' do - it 'processes the commit message' do - forked = fork_project(project, user, repository: true) - project.destroy! - - expect(worker).to receive(:process_commit_message) - - worker.perform(forked.id, user.id, forked.commit.to_hash) - end - end - end end describe '#process_commit_message' do -- cgit v1.2.3