From 0377c015cf903a1d22d14086e68d1e07fe3c2643 Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Mon, 13 Aug 2018 15:36:15 -0700
Subject: Refactor GitHub Importer database helpers into helper methods

This in preparation for addressing idle-in-transaction timeouts for other importers.

Part of #50021
---
 lib/gitlab/github_import.rb                        | 18 -----
 .../github_import/importer/issue_importer.rb       |  4 +-
 .../importer/pull_request_importer.rb              | 92 +++++-----------------
 lib/gitlab/import/database_helpers.rb              | 25 ++++++
 lib/gitlab/import/merge_request_helpers.rb         | 70 ++++++++++++++++
 5 files changed, 118 insertions(+), 91 deletions(-)
 create mode 100644 lib/gitlab/import/database_helpers.rb
 create mode 100644 lib/gitlab/import/merge_request_helpers.rb

(limited to 'lib')

diff --git a/lib/gitlab/github_import.rb b/lib/gitlab/github_import.rb
index 65b5e30c70f..d40b06f969f 100644
--- a/lib/gitlab/github_import.rb
+++ b/lib/gitlab/github_import.rb
@@ -10,24 +10,6 @@ module Gitlab
       Client.new(token_to_use, parallel: parallel)
     end
 
-    # Inserts a raw row and returns the ID of the inserted row.
-    #
-    # attributes - The attributes/columns to set.
-    # relation - An ActiveRecord::Relation to use for finding the ID of the row
-    #            when using MySQL.
-    def self.insert_and_return_id(attributes, relation)
-      # We use bulk_insert here so we can bypass any queries executed by
-      # callbacks or validation rules, as doing this wouldn't scale when
-      # importing very large projects.
-      result = Gitlab::Database
-        .bulk_insert(relation.table_name, [attributes], return_ids: true)
-
-      # MySQL doesn't support returning the IDs of a bulk insert in a way that
-      # is not a pain, so in this case we'll issue an extra query instead.
-      result.first ||
-        relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first
-    end
-
     # Returns the ID of the ghost user.
     def self.ghost_user_id
       key = 'github-import/ghost-user-id'
diff --git a/lib/gitlab/github_import/importer/issue_importer.rb b/lib/gitlab/github_import/importer/issue_importer.rb
index cb4d7a6a0b6..4226eee85cc 100644
--- a/lib/gitlab/github_import/importer/issue_importer.rb
+++ b/lib/gitlab/github_import/importer/issue_importer.rb
@@ -4,6 +4,8 @@ module Gitlab
   module GithubImport
     module Importer
       class IssueImporter
+        include Gitlab::Import::DatabaseHelpers
+
         attr_reader :project, :issue, :client, :user_finder, :milestone_finder,
                     :issuable_finder
 
@@ -55,7 +57,7 @@ module Gitlab
             updated_at: issue.updated_at
           }
 
-          GithubImport.insert_and_return_id(attributes, project.issues).tap do |id|
+          insert_and_return_id(attributes, project.issues).tap do |id|
             # We use .insert_and_return_id which effectively disables all callbacks.
             # Trigger iid logic here to make sure we track internal id values consistently.
             project.issues.find(id).ensure_project_iid!
diff --git a/lib/gitlab/github_import/importer/pull_request_importer.rb b/lib/gitlab/github_import/importer/pull_request_importer.rb
index ed17aa54373..ae7c4cf1b38 100644
--- a/lib/gitlab/github_import/importer/pull_request_importer.rb
+++ b/lib/gitlab/github_import/importer/pull_request_importer.rb
@@ -4,6 +4,8 @@ module Gitlab
   module GithubImport
     module Importer
       class PullRequestImporter
+        include Gitlab::Import::MergeRequestHelpers
+
         attr_reader :pull_request, :project, :client, :user_finder,
                     :milestone_finder, :issuable_finder
 
@@ -44,81 +46,27 @@ module Gitlab
           description = MarkdownText
             .format(pull_request.description, pull_request.author, author_found)
 
-          # This work must be wrapped in a transaction as otherwise we can leave
-          # behind incomplete data in the event of an error. This can then lead
-          # to duplicate key errors when jobs are retried.
-          MergeRequest.transaction do
-            attributes = {
-              iid: pull_request.iid,
-              title: pull_request.truncated_title,
-              description: description,
-              source_project_id: project.id,
-              target_project_id: project.id,
-              source_branch: pull_request.formatted_source_branch,
-              target_branch: pull_request.target_branch,
-              state: pull_request.state,
-              milestone_id: milestone_finder.id_for(pull_request),
-              author_id: author_id,
-              assignee_id: user_finder.assignee_id_for(pull_request),
-              created_at: pull_request.created_at,
-              updated_at: pull_request.updated_at
-            }
-
-            # When creating merge requests there are a lot of hooks that may
-            # run, for many different reasons. Many of these hooks (e.g. the
-            # ones used for rendering Markdown) are completely unnecessary and
-            # may even lead to transaction timeouts.
-            #
-            # To ensure importing pull requests has a minimal impact and can
-            # complete in a reasonable time we bypass all the hooks by inserting
-            # the row and then retrieving it. We then only perform the
-            # additional work that is strictly necessary.
-            merge_request_id = GithubImport
-              .insert_and_return_id(attributes, project.merge_requests)
-
-            merge_request = project.merge_requests.find(merge_request_id)
-
-            # We use .insert_and_return_id which effectively disables all callbacks.
-            # Trigger iid logic here to make sure we track internal id values consistently.
-            merge_request.ensure_target_project_iid!
+          attributes = {
+            iid: pull_request.iid,
+            title: pull_request.truncated_title,
+            description: description,
+            source_project_id: project.id,
+            target_project_id: project.id,
+            source_branch: pull_request.formatted_source_branch,
+            target_branch: pull_request.target_branch,
+            state: pull_request.state,
+            milestone_id: milestone_finder.id_for(pull_request),
+            author_id: author_id,
+            assignee_id: user_finder.assignee_id_for(pull_request),
+            created_at: pull_request.created_at,
+            updated_at: pull_request.updated_at
+          }
 
-            [merge_request, false]
-          end
-        rescue ActiveRecord::InvalidForeignKey
-          # It's possible the project has been deleted since scheduling this
-          # job. In this case we'll just skip creating the merge request.
-          []
-        rescue ActiveRecord::RecordNotUnique
-          # It's possible we previously created the MR, but failed when updating
-          # the Git data. In this case we'll just continue working on the
-          # existing row.
-          [project.merge_requests.find_by(iid: pull_request.iid), true]
+          create_merge_request_without_hooks(project, attributes, pull_request.iid)
         end
 
-        def insert_git_data(merge_request, already_exists = false)
-          # These fields are set so we can create the correct merge request
-          # diffs.
-          merge_request.source_branch_sha = pull_request.source_branch_sha
-          merge_request.target_branch_sha = pull_request.target_branch_sha
-
-          merge_request.keep_around_commit
-
-          # MR diffs normally use an "after_save" hook to pull data from Git.
-          # All of this happens in the transaction started by calling
-          # create/save/etc. This in turn can lead to these transactions being
-          # held open for much longer than necessary. To work around this we
-          # first save the diff, then populate it.
-          diff =
-            if already_exists
-              merge_request.merge_request_diffs.take ||
-                merge_request.merge_request_diffs.build
-            else
-              merge_request.merge_request_diffs.build
-            end
-
-          diff.importing = true
-          diff.save
-          diff.save_git_content
+        def insert_git_data(merge_request, already_exists)
+          insert_or_replace_git_data(merge_request, pull_request.source_branch_sha, pull_request.target_branch_sha, already_exists)
         end
       end
     end
diff --git a/lib/gitlab/import/database_helpers.rb b/lib/gitlab/import/database_helpers.rb
new file mode 100644
index 00000000000..80857061933
--- /dev/null
+++ b/lib/gitlab/import/database_helpers.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Import
+    module DatabaseHelpers
+      # Inserts a raw row and returns the ID of the inserted row.
+      #
+      # attributes - The attributes/columns to set.
+      # relation - An ActiveRecord::Relation to use for finding the ID of the row
+      #            when using MySQL.
+      def insert_and_return_id(attributes, relation)
+        # We use bulk_insert here so we can bypass any queries executed by
+        # callbacks or validation rules, as doing this wouldn't scale when
+        # importing very large projects.
+        result = Gitlab::Database
+                 .bulk_insert(relation.table_name, [attributes], return_ids: true)
+
+        # MySQL doesn't support returning the IDs of a bulk insert in a way that
+        # is not a pain, so in this case we'll issue an extra query instead.
+        result.first ||
+          relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/import/merge_request_helpers.rb b/lib/gitlab/import/merge_request_helpers.rb
new file mode 100644
index 00000000000..8ba70700dc1
--- /dev/null
+++ b/lib/gitlab/import/merge_request_helpers.rb
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Import
+    module MergeRequestHelpers
+      include DatabaseHelpers
+
+      def create_merge_request_without_hooks(project, attributes, iid)
+        # This work must be wrapped in a transaction as otherwise we can leave
+        # behind incomplete data in the event of an error. This can then lead
+        # to duplicate key errors when jobs are retried.
+        MergeRequest.transaction do
+          # When creating merge requests there are a lot of hooks that may
+          # run, for many different reasons. Many of these hooks (e.g. the
+          # ones used for rendering Markdown) are completely unnecessary and
+          # may even lead to transaction timeouts.
+          #
+          # To ensure importing pull requests has a minimal impact and can
+          # complete in a reasonable time we bypass all the hooks by inserting
+          # the row and then retrieving it. We then only perform the
+          # additional work that is strictly necessary.
+          merge_request_id = insert_and_return_id(attributes, project.merge_requests)
+
+          merge_request = project.merge_requests.find(merge_request_id)
+
+          # We use .insert_and_return_id which effectively disables all callbacks.
+          # Trigger iid logic here to make sure we track internal id values consistently.
+          merge_request.ensure_target_project_iid!
+
+          [merge_request, false]
+        end
+      rescue ActiveRecord::InvalidForeignKey
+        # It's possible the project has been deleted since scheduling this
+        # job. In this case we'll just skip creating the merge request.
+        []
+      rescue ActiveRecord::RecordNotUnique
+        # It's possible we previously created the MR, but failed when updating
+        # the Git data. In this case we'll just continue working on the
+        # existing row.
+        [project.merge_requests.find_by(iid: iid), true]
+      end
+
+      def insert_or_replace_git_data(merge_request, source_branch_sha, target_branch_sha, already_exists = false)
+        # These fields are set so we can create the correct merge request
+        # diffs.
+        merge_request.source_branch_sha = source_branch_sha
+        merge_request.target_branch_sha = target_branch_sha
+
+        merge_request.keep_around_commit
+
+        # MR diffs normally use an "after_save" hook to pull data from Git.
+        # All of this happens in the transaction started by calling
+        # create/save/etc. This in turn can lead to these transactions being
+        # held open for much longer than necessary. To work around this we
+        # first save the diff, then populate it.
+        diff =
+          if already_exists
+            merge_request.merge_request_diffs.take ||
+              merge_request.merge_request_diffs.build
+          else
+            merge_request.merge_request_diffs.build
+          end
+
+        diff.importing = true
+        diff.save
+        diff.save_git_content
+      end
+    end
+  end
+end
-- 
cgit v1.2.3