diff options
Diffstat (limited to 'lib/gitlab/github_import')
-rw-r--r-- | lib/gitlab/github_import/attachments_downloader.rb | 39 | ||||
-rw-r--r-- | lib/gitlab/github_import/client.rb | 14 | ||||
-rw-r--r-- | lib/gitlab/github_import/importer/note_attachments_importer.rb | 19 | ||||
-rw-r--r-- | lib/gitlab/github_import/markdown/attachment.rb | 13 | ||||
-rw-r--r-- | lib/gitlab/github_import/object_counter.rb | 10 | ||||
-rw-r--r-- | lib/gitlab/github_import/user_finder.rb | 110 |
6 files changed, 171 insertions, 34 deletions
diff --git a/lib/gitlab/github_import/attachments_downloader.rb b/lib/gitlab/github_import/attachments_downloader.rb index b71d5f753f2..4db55a6aabb 100644 --- a/lib/gitlab/github_import/attachments_downloader.rb +++ b/lib/gitlab/github_import/attachments_downloader.rb @@ -8,15 +8,17 @@ module Gitlab include ::BulkImports::FileDownloads::Validations DownloadError = Class.new(StandardError) + UnsupportedAttachmentError = Class.new(StandardError) FILENAME_SIZE_LIMIT = 255 # chars before the extension DEFAULT_FILE_SIZE_LIMIT = 25.megabytes TMP_DIR = File.join(Dir.tmpdir, 'github_attachments').freeze - attr_reader :file_url, :filename, :file_size_limit + attr_reader :file_url, :filename, :file_size_limit, :options - def initialize(file_url, file_size_limit: DEFAULT_FILE_SIZE_LIMIT) + def initialize(file_url, options: {}, file_size_limit: DEFAULT_FILE_SIZE_LIMIT) @file_url = file_url + @options = options @file_size_limit = file_size_limit filename = URI(file_url).path.split('/').last @@ -27,7 +29,9 @@ module Gitlab validate_content_length validate_filepath - file = download + redirection_url = get_assets_download_redirection_url + file = download_from(redirection_url) + validate_symlink file end @@ -47,9 +51,34 @@ module Gitlab Gitlab::HTTP.perform_request(Net::HTTP::Head, file_url, {}).headers end - def download + # Github /assets redirection link will redirect to aws which has its own authorization. + # Keeping our bearer token will cause request rejection + # eg. Only one auth mechanism allowed; only the X-Amz-Algorithm query parameter, + # Signature query string parameter or the Authorization header should be specified. + def get_assets_download_redirection_url + return file_url unless file_url.starts_with?(github_assets_url_regex) + + options[:follow_redirects] = false + response = Gitlab::HTTP.perform_request(Net::HTTP::Get, file_url, options) + raise_error("expected a redirect response, got #{response.code}") unless response.redirection? + + redirection_url = response.headers[:location] + filename = URI.parse(redirection_url).path + + unless Gitlab::GithubImport::Markdown::Attachment::MEDIA_TYPES.any? { |type| filename.ends_with?(type) } + raise UnsupportedAttachmentError + end + + redirection_url + end + + def github_assets_url_regex + %r{#{Regexp.escape(::Gitlab::GithubImport::MarkdownText.github_url)}/.*/assets/} + end + + def download_from(url) file = File.open(filepath, 'wb') - Gitlab::HTTP.perform_request(Net::HTTP::Get, file_url, stream_body: true) { |batch| file.write(batch) } + Gitlab::HTTP.perform_request(Net::HTTP::Get, url, stream_body: true) { |batch| file.write(batch) } file end diff --git a/lib/gitlab/github_import/client.rb b/lib/gitlab/github_import/client.rb index 886563a6f69..23d4faa3dde 100644 --- a/lib/gitlab/github_import/client.rb +++ b/lib/gitlab/github_import/client.rb @@ -67,10 +67,18 @@ module Gitlab end # Returns the details of a GitHub user. + # 304 (Not Modified) status means the user is cached - API won't return user data. # - # username - The username of the user. - def user(username) - with_rate_limit { octokit.user(username).to_h } + # @param username[String] the username of the user. + # @param options[Hash] the optional parameters. + def user(username, options = {}) + with_rate_limit do + user = octokit.user(username, options) + + next if octokit.last_response&.status == 304 + + user.to_h + end end def pull_request_reviews(repo_name, iid) diff --git a/lib/gitlab/github_import/importer/note_attachments_importer.rb b/lib/gitlab/github_import/importer/note_attachments_importer.rb index 266ee2938ba..26472b0d468 100644 --- a/lib/gitlab/github_import/importer/note_attachments_importer.rb +++ b/lib/gitlab/github_import/importer/note_attachments_importer.rb @@ -4,14 +4,15 @@ module Gitlab module GithubImport module Importer class NoteAttachmentsImporter - attr_reader :note_text, :project + attr_reader :note_text, :project, :client # note_text - An instance of `Gitlab::GithubImport::Representation::NoteText`. # project - An instance of `Project`. # client - An instance of `Gitlab::GithubImport::Client`. - def initialize(note_text, project, _client = nil) + def initialize(note_text, project, client) @note_text = note_text @project = project + @client = client end def execute @@ -33,7 +34,7 @@ module Gitlab if attachment.part_of_project_blob?(project_import_source) convert_project_content_link(attachment.url, project_import_source) - elsif attachment.media? || attachment.doc_belongs_to_project?(project_import_source) + elsif attachment.media?(project_import_source) || attachment.doc_belongs_to_project?(project_import_source) download_attachment(attachment) else # url to other GitHub project attachment.url @@ -53,14 +54,24 @@ module Gitlab # in: an instance of Gitlab::GithubImport::Markdown::Attachment # out: gitlab attachment markdown url def download_attachment(attachment) - downloader = ::Gitlab::GithubImport::AttachmentsDownloader.new(attachment.url) + downloader = ::Gitlab::GithubImport::AttachmentsDownloader.new(attachment.url, options: options) file = downloader.perform uploader = UploadService.new(project, file, FileUploader).execute uploader.to_h[:url] + rescue ::Gitlab::GithubImport::AttachmentsDownloader::UnsupportedAttachmentError + attachment.url ensure downloader&.delete end + def options + { + headers: { + 'Authorization' => "Bearer #{client.octokit.access_token}" + } + } + end + def update_note_record(text) case note_text.record_type when ::Release.name diff --git a/lib/gitlab/github_import/markdown/attachment.rb b/lib/gitlab/github_import/markdown/attachment.rb index e270cfba619..0d8f3196719 100644 --- a/lib/gitlab/github_import/markdown/attachment.rb +++ b/lib/gitlab/github_import/markdown/attachment.rb @@ -57,7 +57,8 @@ module Gitlab def github_url?(url, docs: false, media: false) if media - url.start_with?(::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) + url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url, + ::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) elsif docs url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url) end @@ -65,6 +66,9 @@ module Gitlab def whitelisted_type?(url, docs: false, media: false) if media + # We do not know the file extension type from the /assets markdown + return true if url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url) + MEDIA_TYPES.any? { |type| url.end_with?(type) } elsif docs DOC_TYPES.any? { |type| url.end_with?(type) } @@ -91,8 +95,11 @@ module Gitlab ) end - def media? - url.start_with?(::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) + def media?(import_source) + url.start_with?( + "#{::Gitlab::GithubImport::MarkdownText.github_url}/#{import_source}/assets", + ::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN + ) end def inspect diff --git a/lib/gitlab/github_import/object_counter.rb b/lib/gitlab/github_import/object_counter.rb index 7ee64b2abac..88e91800cee 100644 --- a/lib/gitlab/github_import/object_counter.rb +++ b/lib/gitlab/github_import/object_counter.rb @@ -14,6 +14,8 @@ module Gitlab CACHING = Gitlab::Cache::Import::Caching + IMPORT_CACHING_TIMEOUT = 2.weeks.to_i + class << self # Increments the project and the global counters if the given value is >= 1 def increment(project, object_type, operation, value: 1) @@ -36,7 +38,7 @@ module Gitlab # After import is completed we store this information in project's import_checksums return cached_summary if cached_summary != EMPTY_SUMMARY || project.import_state.blank? - project.import_state.in_progress? ? cached_summary : project.import_checksums + project.import_state.completed? ? project.import_checksums : cached_summary end private @@ -50,7 +52,7 @@ module Gitlab .sort .each do |counter| object_type = counter.split('/').last - result[operation][object_type] = CACHING.read_integer(counter) + result[operation][object_type] = CACHING.read_integer(counter) || 0 end end end @@ -84,11 +86,11 @@ module Gitlab add_counter_to_list(project, operation, counter_key) - CACHING.increment_by(counter_key, value) + CACHING.increment_by(counter_key, value, timeout: IMPORT_CACHING_TIMEOUT) end def add_counter_to_list(project, operation, key) - CACHING.set_add(counter_list_key(project, operation), key) + CACHING.set_add(counter_list_key(project, operation), key, timeout: IMPORT_CACHING_TIMEOUT) end def counter_list_key(project, operation) diff --git a/lib/gitlab/github_import/user_finder.rb b/lib/gitlab/github_import/user_finder.rb index 57365ebe206..1832f071a44 100644 --- a/lib/gitlab/github_import/user_finder.rb +++ b/lib/gitlab/github_import/user_finder.rb @@ -28,6 +28,17 @@ module Gitlab EMAIL_FOR_USERNAME_CACHE_KEY = 'github-import/user-finder/email-for-username/%s' + # The base cache key to use for caching the user ETAG response headers + USERNAME_ETAG_CACHE_KEY = 'github-import/user-finder/user-etag/%s' + + # The base cache key to store whether an email has been fetched for a project + EMAIL_FETCHED_FOR_PROJECT_CACHE_KEY = 'github-import/user-finder/%{project}/email-fetched/%{username}' + + EMAIL_API_CALL_LOGGING_MESSAGE = { + true => 'Fetching email from GitHub with ETAG header', + false => 'Fetching email from GitHub' + }.freeze + # project - An instance of `Project` # client - An instance of `Gitlab::GithubImport::Client` def initialize(project, client) @@ -109,24 +120,39 @@ module Gitlab id_for_github_id(id) || id_for_github_email(email) end - # Find the public email of a given username in GitHub. The public email is cached to avoid multiple calls to - # GitHub. In case the username does not exist or the public email is nil, a blank value is cached to also prevent - # multiple calls to GitHub. + # Find the public email of a given username in GitHub. + # The email is cached to avoid multiple calls to GitHub. The cache is shared among all projects. + # If the email was not found, a blank email is cached. + # If the email is blank, we attempt to fetch it from GitHub using an ETAG request once for every project. + + # @param username [String] The username of the GitHub user. # # @return [String] If public email is found # @return [Nil] If public email or username does not exist def email_for_github_username(username) - cache_key = EMAIL_FOR_USERNAME_CACHE_KEY % username - email = Gitlab::Cache::Import::Caching.read(cache_key) + email = read_email_from_cache(username) + + if email.blank? && !email_fetched_for_project?(username) + # If an ETAG is available, make an API call with the ETAG. + # Only make a rate-limited API call if the ETAG is not available and the email is nil. + etag = read_etag_from_cache(username) + email = fetch_email_from_github(username, etag: etag) || email - if email.nil? - user = client.user(username) - email = Gitlab::Cache::Import::Caching.write(cache_key, user[:email].to_s, timeout: timeout(user[:email])) + cache_email!(username, email) + cache_etag!(username) if email.blank? && etag.nil? + + # If a non-blank email is cached, we don't need the ETAG or project check caches. + # Otherwise, indicate that the project has been checked. + if email.present? + clear_caches!(username) + else + set_project_as_checked!(username) + end end email.presence rescue ::Octokit::NotFound - Gitlab::Cache::Import::Caching.write(cache_key, '') + cache_email!(username, '') nil end @@ -192,12 +218,66 @@ module Gitlab private - def timeout(email) - if email - Gitlab::Cache::Import::Caching::TIMEOUT - else - Gitlab::Cache::Import::Caching::SHORTER_TIMEOUT - end + def read_email_from_cache(username) + Gitlab::Cache::Import::Caching.read(email_cache_key(username)) + end + + def read_etag_from_cache(username) + Gitlab::Cache::Import::Caching.read(etag_cache_key(username)) + end + + def email_fetched_for_project?(username) + email_fetched_for_project_cache_key = email_fetched_for_project_cache_key(username) + Gitlab::Cache::Import::Caching.read(email_fetched_for_project_cache_key) + end + + def fetch_email_from_github(username, etag: nil) + log(EMAIL_API_CALL_LOGGING_MESSAGE[etag.present?], username: username) + user = client.user(username, { headers: { 'If-None-Match' => etag }.compact }) + + user[:email] || '' if user + end + + def cache_email!(username, email) + return unless email + + Gitlab::Cache::Import::Caching.write(email_cache_key(username), email) + end + + def cache_etag!(username) + etag = client.octokit.last_response.headers[:etag] + Gitlab::Cache::Import::Caching.write(etag_cache_key(username), etag) + end + + def set_project_as_checked!(username) + Gitlab::Cache::Import::Caching.write(email_fetched_for_project_cache_key(username), 1) + end + + def clear_caches!(username) + Gitlab::Cache::Import::Caching.expire(etag_cache_key(username), 0) + Gitlab::Cache::Import::Caching.expire(email_fetched_for_project_cache_key(username), 0) + end + + def email_cache_key(username) + EMAIL_FOR_USERNAME_CACHE_KEY % username + end + + def etag_cache_key(username) + USERNAME_ETAG_CACHE_KEY % username + end + + def email_fetched_for_project_cache_key(username) + format(EMAIL_FETCHED_FOR_PROJECT_CACHE_KEY, project: project.id, username: username) + end + + def log(message, username: nil) + Gitlab::Import::Logger.info( + import_type: :github, + project_id: project.id, + class: self.class.name, + username: username, + message: message + ) end end end |