Welcome to mirror list, hosted at ThFree Co, Russian Federation.

user_finder.rb « github_import « gitlab « lib - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: bec4c7fc4d418b9a8e5c45ebb8c5b1bcd98fa731 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# frozen_string_literal: true

module Gitlab
  module GithubImport
    # Class that can be used for finding a GitLab user ID based on a GitHub user
    # ID or username.
    #
    # Any found user IDs are cached in Redis to reduce the number of SQL queries
    # executed over time. Valid keys are refreshed upon access so frequently
    # used keys stick around.
    #
    # Lookups are cached even if no ID was found to remove the need for querying
    # the database when most queries are not going to return results anyway.
    class UserFinder
      # NOTE(review): presumably the source of `in_lock`, which is used when
      # fetching emails from GitHub - confirm against ExclusiveLeaseHelpers.
      include Gitlab::ExclusiveLeaseHelpers

      attr_reader :project, :client

      # The cache key template for the GitLab user ID of a given GitHub user ID.
      ID_CACHE_KEY = 'github-import/user-finder/user-id/%s'

      # The cache key template for the GitLab user ID of a given GitHub email address.
      ID_FOR_EMAIL_CACHE_KEY = 'github-import/user-finder/id-for-email/%s'

      # The cache key template for the email address of a given GitHub username.
      EMAIL_FOR_USERNAME_CACHE_KEY = 'github-import/user-finder/email-for-username/%s'

      # The cache key template for the ETAG response header of a GitHub user
      # lookup, keyed by username.
      USERNAME_ETAG_CACHE_KEY = 'github-import/user-finder/user-etag/%s'

      # The cache key template that records whether the email of a username has
      # already been fetched for a given project.
      EMAIL_FETCHED_FOR_PROJECT_CACHE_KEY = 'github-import/user-finder/%{project}/email-fetched/%{username}'

      # Log messages for the GitHub email lookup, keyed by whether an ETAG is
      # sent along with the request.
      EMAIL_API_CALL_LOGGING_MESSAGE = {
        true => 'Fetching email from GitHub with ETAG header',
        false => 'Fetching email from GitHub'
      }.freeze

      # Builds a finder bound to a single project and GitHub client.
      #
      # project - An instance of `Project`
      # client - An instance of `Gitlab::GithubImport::Client`
      def initialize(project, client)
        @client = client
        @project = project
      end

      # Returns the GitLab user ID of an object's author, paired with a flag
      # telling whether that ID belongs to the author (or the ghost user).
      #
      # If the object is nil, or has no user under the requested key, the ID of
      # the GitLab ghost user is used. If the user is present but no GitLab
      # user could be resolved for it, the project creator's ID is returned
      # with the flag set to false.
      #
      # object - An instance of `Hash` or a `Github::Representer`, or nil.
      # author_key - The key holding the user: :actor, :assignee,
      #              :requested_reviewer or :review_requester. Any other value
      #              reads :author.
      #
      # Returns an Array of `[user_id, found]`.
      def author_id_for(object, author_key: :author)
        supported_keys = %i[actor assignee requested_reviewer review_requester]
        key = supported_keys.include?(author_key) ? author_key : :author

        # Guard against a nil object for every key, not just :author, so a
        # missing object consistently resolves to the ghost user.
        user_info = object ? object[key] : nil

        id = user_info ? user_id_for(user_info) : GithubImport.ghost_user_id

        id ? [id, true] : [project.creator_id, false]
      end

      # Returns the GitLab user ID of an issuable's assignee, or nil when the
      # issuable has no assignee.
      def assignee_id_for(issuable)
        assignee = issuable[:assignee]

        user_id_for(assignee) if assignee
      end

      # Returns the GitLab user ID for a GitHub user, or nil when no user is
      # given.
      #
      # user - An instance of `Gitlab::GithubImport::Representation::User` or `Hash`.
      def user_id_for(user)
        return unless user.present?

        find(user[:id], user[:login])
      end

      # Returns the GitLab ID for the given GitHub ID or username.
      #
      # The cache is consulted first; the database is only queried when the
      # cache holds no entry for this user.
      #
      # id - The ID of the GitHub user.
      # username - The username of the GitHub user.
      def find(id, username)
        email = email_for_github_username(username)
        was_cached, gitlab_id = find_from_cache(id, email)

        if gitlab_id
          gitlab_id
        elsif !was_cached
          # A cached lookup without an ID means an earlier lookup came up
          # empty, so the database is left alone until the keys expire.
          find_id_from_database(id, email)
        end
      end

      # Finds a user ID from the cache for a given GitHub ID or Email.
      #
      # Returns an Array whose first value tells whether a cache entry was
      # found and whose second value (when present) is the cached user ID.
      def find_from_cache(id, email = nil)
        cached_by_id, gitlab_id = cached_id_for_github_id(id)

        return [cached_by_id, gitlab_id] if gitlab_id

        # Without an Email address (for whatever reason) there is nothing else
        # to look up, so report a cache miss.
        email ? cached_id_for_github_email(email) : [false]
      end

      # Finds a GitLab user ID from the database for a given GitHub user ID or
      # Email. The Email lookup only runs when the ID lookup yields nothing.
      def find_id_from_database(id, email)
        gitlab_id = id_for_github_id(id)
        return gitlab_id if gitlab_id

        id_for_github_email(email)
      end

      # Finds the public email of a given username in GitHub.
      #
      # The email is cached to avoid multiple calls to GitHub. The cache is shared among all projects.
      # If the email was not found, a blank email is cached.
      # If the cached email is blank, we attempt to fetch it from GitHub using an ETAG request once for every project.
      #
      # @param username [String] The username of the GitHub user.
      #
      # @return [String] If public email is found
      # @return [nil] If public email or username does not exist
      def email_for_github_username(username)
        email = read_email_from_cache(username)

        if email.blank? && !email_fetched_for_project?(username)
          # If an ETAG is available, make an API call with the ETAG.
          # Only make a rate-limited API call if the ETAG is not available and the email is nil.
          etag = read_etag_from_cache(username)
          # Keep the previously cached value when the fetch returns nothing.
          email = fetch_email_from_github(username, etag: etag) || email

          # `cache_email!` skips the write when the email is still nil.
          cache_email!(username, email)
          # Store the response ETAG only when no email was found and no ETAG had been cached yet.
          cache_etag!(username) if email.blank? && etag.nil?

          # If a non-blank email is cached, we don't need the ETAG or project check caches.
          # Otherwise, indicate that the project has been checked.
          if email.present?
            clear_caches!(username)
          else
            set_project_as_checked!(username)
          end
        end

        email.presence
      rescue ::Octokit::NotFound
        # The lookup raised `Octokit::NotFound`: cache a blank email for the username and report no email.
        cache_email!(username, '')
        nil
      end

      # Reads the cached GitLab user ID for the given GitHub user ID.
      #
      # Returns whatever `read_id_from_cache` returns for the ID's cache key.
      def cached_id_for_github_id(id)
        cache_key = ID_CACHE_KEY % id

        read_id_from_cache(cache_key)
      end

      # Reads the cached GitLab user ID for the given GitHub email address.
      #
      # Returns whatever `read_id_from_cache` returns for the email's cache key.
      def cached_id_for_github_email(email)
        cache_key = ID_FOR_EMAIL_CACHE_KEY % email

        read_id_from_cache(cache_key)
      end

      # Queries and caches the GitLab user ID for a GitHub user ID, if one was
      # found, when importing from github.com.
      #
      # For GitHub Enterprise imports the database is not queried by GitHub ID,
      # since we only have users' GitHub IDs from github.com; nil is written to
      # the cache instead.
      #
      # Returns the result of the cache write.
      def id_for_github_id(id)
        gitlab_id = project.github_enterprise_import? ? nil : query_id_for_github_id(id)

        Gitlab::Cache::Import::Caching.write(ID_CACHE_KEY % id, gitlab_id)
      end

      # Queries the GitLab user ID for a GitHub email and caches the outcome
      # (nil when no user was found) under the email's cache key.
      #
      # Returns the result of the cache write.
      def id_for_github_email(email)
        cache_key = ID_FOR_EMAIL_CACHE_KEY % email
        gitlab_id = query_id_for_github_email(email) || nil

        Gitlab::Cache::Import::Caching.write(cache_key, gitlab_id)
      end

      # rubocop: disable CodeReuse/ActiveRecord
      # Looks up the ID of the user linked to the given GitHub external UID.
      # Returns nil when there is no such user.
      def query_id_for_github_id(id)
        user = User.by_provider_and_extern_uid(:github, id).select(:id).first

        user&.id
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # rubocop: disable CodeReuse/ActiveRecord
      # Looks up the ID of a user matching the given email via
      # `User.by_any_email`.
      def query_id_for_github_email(email)
        matching_users = User.by_any_email(email)

        matching_users.pick(:id)
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Reads an ID from the cache.
      #
      # The return value is an Array with two values:
      #
      # 1. A boolean indicating if the key was present or not.
      # 2. The ID as an Integer, or nil in case no ID could be found.
      #
      # A key may be present without holding an ID: that marks a user that was
      # looked up before and for which no ID could be found.
      def read_id_from_cache(key)
        raw = Gitlab::Cache::Import::Caching.read(key)
        gitlab_id = raw.to_i

        [!raw.nil?, gitlab_id.positive? ? gitlab_id : nil]
      end

      private

      # The lock key used when fetching emails from GitHub; it embeds the
      # project ID.
      def lease_key
        ['gitlab', 'github_import', 'user_finder', project.id].join(':')
      end

      # Reads the cached email of the given username.
      #
      # The return value can be an email, an empty string, or nil:
      #
      # - An email means the user has a public email set and it was cached.
      # - An empty string means the email was fetched but is not set on GitHub.
      # - nil means the email wasn't fetched yet or the cache has expired.
      def read_email_from_cache(username)
        cache_key = email_cache_key(username)

        Gitlab::Cache::Import::Caching.read(cache_key)
      end

      # Reads the cached ETAG of the given username, if any.
      def read_etag_from_cache(username)
        cache_key = etag_cache_key(username)

        Gitlab::Cache::Import::Caching.read(cache_key)
      end

      # Tells whether the email of the given username was already fetched for
      # this project. Returns the raw cached marker, or nil when there is none.
      def email_fetched_for_project?(username)
        Gitlab::Cache::Import::Caching.read(email_fetched_for_project_cache_key(username))
      end

      # Fetches the email of the given username from GitHub while holding the
      # project lock.
      #
      # username - The username of the GitHub user.
      # etag - An optional ETAG, sent as the `If-None-Match` request header.
      #
      # Returns the user's email, an empty string when the returned user has no
      # email, or nil when the client returned no user.
      def fetch_email_from_github(username, etag: nil)
        in_lock(lease_key, ttl: 3.minutes, sleep_sec: 1.second, retries: 30) do |retried|
          # When retried, check the cache again as the other process that had
          # the lease may have fetched the email.
          cached_email = retried ? read_email_from_cache(username) : nil
          next cached_email if cached_email.present?

          log(EMAIL_API_CALL_LOGGING_MESSAGE[etag.present?], username: username)

          # The `If-None-Match` header is only sent when an ETAG is available.
          request_headers = { 'If-None-Match' => etag }.compact
          user = client.user(username, { headers: request_headers })
          next unless user

          user[:email] || ''
        end
      end

      # Caches the email associated with the username.
      #
      # An empty email is cached when the user's email isn't set on GitHub, so
      # that UserFinder doesn't fetch that user's email again. A nil email is
      # not cached at all.
      def cache_email!(username, email)
        Gitlab::Cache::Import::Caching.write(email_cache_key(username), email) if email
      end

      # Caches the ETAG header of the client's last response under the
      # username's ETAG key. Does nothing when there is no last response.
      def cache_etag!(username)
        response = client.octokit.last_response
        return unless response

        Gitlab::Cache::Import::Caching.write(etag_cache_key(username), response.headers[:etag])
      end

      # Records that the email of the given username was fetched for this
      # project.
      def set_project_as_checked!(username)
        checked_key = email_fetched_for_project_cache_key(username)

        Gitlab::Cache::Import::Caching.write(checked_key, 1)
      end

      # Expires the ETAG cache and the per-project "email fetched" marker of
      # the given username, in that order, with a timeout of 0.
      def clear_caches!(username)
        stale_keys = [etag_cache_key(username), email_fetched_for_project_cache_key(username)]

        stale_keys.map { |key| Gitlab::Cache::Import::Caching.expire(key, 0) }.last
      end

      # The cache key holding the email of the given username.
      def email_cache_key(username)
        format(EMAIL_FOR_USERNAME_CACHE_KEY, username)
      end

      # The cache key holding the ETAG of the given username.
      def etag_cache_key(username)
        format(USERNAME_ETAG_CACHE_KEY, username)
      end

      # The cache key marking that the email of the given username was fetched
      # for this project.
      def email_fetched_for_project_cache_key(username)
        EMAIL_FETCHED_FOR_PROJECT_CACHE_KEY % { project: project.id, username: username }
      end

      # Logs an info entry carrying the project ID, this class name, the
      # username and the message.
      def log(message, username: nil)
        payload = {
          project_id: project.id,
          class: self.class.name,
          username: username,
          message: message
        }

        Logger.info(**payload)
      end
    end
  end
end