From 0377c015cf903a1d22d14086e68d1e07fe3c2643 Mon Sep 17 00:00:00 2001 From: Stan Hu Date: Mon, 13 Aug 2018 15:36:15 -0700 Subject: Refactor GitHub Importer database helpers into helper methods This in preparation for addressing idle-in-transaction timeouts for other importers. Part of #50021 --- lib/gitlab/github_import.rb | 18 ----- .../github_import/importer/issue_importer.rb | 4 +- .../importer/pull_request_importer.rb | 92 +++++----------------- lib/gitlab/import/database_helpers.rb | 25 ++++++ lib/gitlab/import/merge_request_helpers.rb | 70 ++++++++++++++++ 5 files changed, 118 insertions(+), 91 deletions(-) create mode 100644 lib/gitlab/import/database_helpers.rb create mode 100644 lib/gitlab/import/merge_request_helpers.rb (limited to 'lib') diff --git a/lib/gitlab/github_import.rb b/lib/gitlab/github_import.rb index 65b5e30c70f..d40b06f969f 100644 --- a/lib/gitlab/github_import.rb +++ b/lib/gitlab/github_import.rb @@ -10,24 +10,6 @@ module Gitlab Client.new(token_to_use, parallel: parallel) end - # Inserts a raw row and returns the ID of the inserted row. - # - # attributes - The attributes/columns to set. - # relation - An ActiveRecord::Relation to use for finding the ID of the row - # when using MySQL. - def self.insert_and_return_id(attributes, relation) - # We use bulk_insert here so we can bypass any queries executed by - # callbacks or validation rules, as doing this wouldn't scale when - # importing very large projects. - result = Gitlab::Database - .bulk_insert(relation.table_name, [attributes], return_ids: true) - - # MySQL doesn't support returning the IDs of a bulk insert in a way that - # is not a pain, so in this case we'll issue an extra query instead. - result.first || - relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first - end - # Returns the ID of the ghost user. def self.ghost_user_id key = 'github-import/ghost-user-id' diff --git a/lib/gitlab/github_import/importer/issue_importer.rb b/lib/gitlab/github_import/importer/issue_importer.rb index cb4d7a6a0b6..4226eee85cc 100644 --- a/lib/gitlab/github_import/importer/issue_importer.rb +++ b/lib/gitlab/github_import/importer/issue_importer.rb @@ -4,6 +4,8 @@ module Gitlab module GithubImport module Importer class IssueImporter + include Gitlab::Import::DatabaseHelpers + attr_reader :project, :issue, :client, :user_finder, :milestone_finder, :issuable_finder @@ -55,7 +57,7 @@ module Gitlab updated_at: issue.updated_at } - GithubImport.insert_and_return_id(attributes, project.issues).tap do |id| + insert_and_return_id(attributes, project.issues).tap do |id| # We use .insert_and_return_id which effectively disables all callbacks. # Trigger iid logic here to make sure we track internal id values consistently. project.issues.find(id).ensure_project_iid! diff --git a/lib/gitlab/github_import/importer/pull_request_importer.rb b/lib/gitlab/github_import/importer/pull_request_importer.rb index ed17aa54373..ae7c4cf1b38 100644 --- a/lib/gitlab/github_import/importer/pull_request_importer.rb +++ b/lib/gitlab/github_import/importer/pull_request_importer.rb @@ -4,6 +4,8 @@ module Gitlab module GithubImport module Importer class PullRequestImporter + include Gitlab::Import::MergeRequestHelpers + attr_reader :pull_request, :project, :client, :user_finder, :milestone_finder, :issuable_finder @@ -44,81 +46,27 @@ module Gitlab description = MarkdownText .format(pull_request.description, pull_request.author, author_found) - # This work must be wrapped in a transaction as otherwise we can leave - # behind incomplete data in the event of an error. This can then lead - # to duplicate key errors when jobs are retried. - MergeRequest.transaction do - attributes = { - iid: pull_request.iid, - title: pull_request.truncated_title, - description: description, - source_project_id: project.id, - target_project_id: project.id, - source_branch: pull_request.formatted_source_branch, - target_branch: pull_request.target_branch, - state: pull_request.state, - milestone_id: milestone_finder.id_for(pull_request), - author_id: author_id, - assignee_id: user_finder.assignee_id_for(pull_request), - created_at: pull_request.created_at, - updated_at: pull_request.updated_at - } - - # When creating merge requests there are a lot of hooks that may - # run, for many different reasons. Many of these hooks (e.g. the - # ones used for rendering Markdown) are completely unnecessary and - # may even lead to transaction timeouts. - # - # To ensure importing pull requests has a minimal impact and can - # complete in a reasonable time we bypass all the hooks by inserting - # the row and then retrieving it. We then only perform the - # additional work that is strictly necessary. - merge_request_id = GithubImport - .insert_and_return_id(attributes, project.merge_requests) - - merge_request = project.merge_requests.find(merge_request_id) - - # We use .insert_and_return_id which effectively disables all callbacks. - # Trigger iid logic here to make sure we track internal id values consistently. - merge_request.ensure_target_project_iid! + attributes = { + iid: pull_request.iid, + title: pull_request.truncated_title, + description: description, + source_project_id: project.id, + target_project_id: project.id, + source_branch: pull_request.formatted_source_branch, + target_branch: pull_request.target_branch, + state: pull_request.state, + milestone_id: milestone_finder.id_for(pull_request), + author_id: author_id, + assignee_id: user_finder.assignee_id_for(pull_request), + created_at: pull_request.created_at, + updated_at: pull_request.updated_at + } - [merge_request, false] - end - rescue ActiveRecord::InvalidForeignKey - # It's possible the project has been deleted since scheduling this - # job. In this case we'll just skip creating the merge request. - [] - rescue ActiveRecord::RecordNotUnique - # It's possible we previously created the MR, but failed when updating - # the Git data. In this case we'll just continue working on the - # existing row. - [project.merge_requests.find_by(iid: pull_request.iid), true] + create_merge_request_without_hooks(project, attributes, pull_request.iid) end - def insert_git_data(merge_request, already_exists = false) - # These fields are set so we can create the correct merge request - # diffs. - merge_request.source_branch_sha = pull_request.source_branch_sha - merge_request.target_branch_sha = pull_request.target_branch_sha - - merge_request.keep_around_commit - - # MR diffs normally use an "after_save" hook to pull data from Git. - # All of this happens in the transaction started by calling - # create/save/etc. This in turn can lead to these transactions being - # held open for much longer than necessary. To work around this we - # first save the diff, then populate it. - diff = - if already_exists - merge_request.merge_request_diffs.take || - merge_request.merge_request_diffs.build - else - merge_request.merge_request_diffs.build - end - - diff.importing = true - diff.save - diff.save_git_content + def insert_git_data(merge_request, already_exists) + insert_or_replace_git_data(merge_request, pull_request.source_branch_sha, pull_request.target_branch_sha, already_exists) end end end diff --git a/lib/gitlab/import/database_helpers.rb b/lib/gitlab/import/database_helpers.rb new file mode 100644 index 00000000000..80857061933 --- /dev/null +++ b/lib/gitlab/import/database_helpers.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +module Gitlab + module Import + module DatabaseHelpers + # Inserts a raw row and returns the ID of the inserted row. + # + # attributes - The attributes/columns to set. + # relation - An ActiveRecord::Relation to use for finding the ID of the row + # when using MySQL. + def insert_and_return_id(attributes, relation) + # We use bulk_insert here so we can bypass any queries executed by + # callbacks or validation rules, as doing this wouldn't scale when + # importing very large projects. + result = Gitlab::Database + .bulk_insert(relation.table_name, [attributes], return_ids: true) + + # MySQL doesn't support returning the IDs of a bulk insert in a way that + # is not a pain, so in this case we'll issue an extra query instead. + result.first || + relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first + end + end + end +end diff --git a/lib/gitlab/import/merge_request_helpers.rb b/lib/gitlab/import/merge_request_helpers.rb new file mode 100644 index 00000000000..8ba70700dc1 --- /dev/null +++ b/lib/gitlab/import/merge_request_helpers.rb @@ -0,0 +1,70 @@ +# frozen_string_literal: true + +module Gitlab + module Import + module MergeRequestHelpers + include DatabaseHelpers + + def create_merge_request_without_hooks(project, attributes, iid) + # This work must be wrapped in a transaction as otherwise we can leave + # behind incomplete data in the event of an error. This can then lead + # to duplicate key errors when jobs are retried. + MergeRequest.transaction do + # When creating merge requests there are a lot of hooks that may + # run, for many different reasons. Many of these hooks (e.g. the + # ones used for rendering Markdown) are completely unnecessary and + # may even lead to transaction timeouts. + # + # To ensure importing pull requests has a minimal impact and can + # complete in a reasonable time we bypass all the hooks by inserting + # the row and then retrieving it. We then only perform the + # additional work that is strictly necessary. + merge_request_id = insert_and_return_id(attributes, project.merge_requests) + + merge_request = project.merge_requests.find(merge_request_id) + + # We use .insert_and_return_id which effectively disables all callbacks. + # Trigger iid logic here to make sure we track internal id values consistently. + merge_request.ensure_target_project_iid! + + [merge_request, false] + end + rescue ActiveRecord::InvalidForeignKey + # It's possible the project has been deleted since scheduling this + # job. In this case we'll just skip creating the merge request. + [] + rescue ActiveRecord::RecordNotUnique + # It's possible we previously created the MR, but failed when updating + # the Git data. In this case we'll just continue working on the + # existing row. + [project.merge_requests.find_by(iid: iid), true] + end + + def insert_or_replace_git_data(merge_request, source_branch_sha, target_branch_sha, already_exists = false) + # These fields are set so we can create the correct merge request + # diffs. + merge_request.source_branch_sha = source_branch_sha + merge_request.target_branch_sha = target_branch_sha + + merge_request.keep_around_commit + + # MR diffs normally use an "after_save" hook to pull data from Git. + # All of this happens in the transaction started by calling + # create/save/etc. This in turn can lead to these transactions being + # held open for much longer than necessary. To work around this we + # first save the diff, then populate it. + diff = + if already_exists + merge_request.merge_request_diffs.take || + merge_request.merge_request_diffs.build + else + merge_request.merge_request_diffs.build + end + + diff.importing = true + diff.save + diff.save_git_content + end + end + end +end -- cgit v1.2.3