# frozen_string_literal: true

module Gitlab
  # Parses a URL, optionally combined with an explicit credentials hash, and
  # exposes variants of it with the userinfo removed (#sanitized_url), masked
  # (#masked_url) or filled in from the credentials (#full_url).
  #
  # Schemeless input (e.g. "user:pass@gitlab.com") is handled by temporarily
  # prepending SCHEMIFY_PLACEHOLDER before parsing and stripping it again from
  # every string that is returned.
  class UrlSanitizer
    include Gitlab::Utils::StrongMemoize

    ALLOWED_SCHEMES = %w[http https ssh git].freeze
    ALLOWED_WEB_SCHEMES = %w[http https].freeze
    SCHEMIFIED_SCHEME = 'glschemelessuri'
    SCHEMIFY_PLACEHOLDER = "#{SCHEMIFIED_SCHEME}://".freeze

    # URI::DEFAULT_PARSER.make_regexp will only match URLs with schemes or
    # relative URLs. The second alternative below matches schemeless URIs with
    # userinfo (e.g. user:pass@gitlab.com) but does not match scp-style URIs
    # (e.g. user@server:path/to/file).
    #
    # The userinfo part is very loose compared to URI's implementation, so we
    # also match non-escaped userinfo, e.g. foo:b?r@gitlab.com, which should be
    # encoded as foo:b%3Fr@gitlab.com
    #
    # NOTE: the pattern uses extended (x) mode. Keep the character class on a
    # single line without spaces.
    URI_REGEXP = %r{
    (?:
       #{URI::DEFAULT_PARSER.make_regexp(ALLOWED_SCHEMES)}
     |
       (?:(?:(?!@)[%#{URI::REGEXP::PATTERN::UNRESERVED}#{URI::REGEXP::PATTERN::RESERVED}])+(?:@))
       (?# negative lookahead ensures this isn't an SCP-style URL: [host]:[rel_path|abs_path] server:path/to/file)
       (?!#{URI::REGEXP::PATTERN::HOST}:(?:#{URI::REGEXP::PATTERN::REL_PATH}|#{URI::REGEXP::PATTERN::ABS_PATH}))
       #{URI::REGEXP::PATTERN::HOSTPORT}
    )
  }x

    # Replaces every substring of +content+ matched by URI_REGEXP with its
    # masked form (see #masked_url). A match that Addressable rejects as an
    # invalid URI is replaced with an empty string.
    #
    # @param content [String] free-form text that may contain URLs
    # @return [String] a new string; +content+ itself is not modified
    def self.sanitize(content)
      content.gsub(URI_REGEXP) do |url|
        new(url).masked_url
      rescue Addressable::URI::InvalidURIError
        ''
      end
    end

    # @param url [Object] candidate URL; anything blank or not a String is invalid
    # @param allowed_schemes [Array<String>] schemes accepted as valid
    # @return [Boolean] true when the stripped +url+ parses and its scheme is
    #   one of +allowed_schemes+; false otherwise, including on parse errors
    def self.valid?(url, allowed_schemes: ALLOWED_SCHEMES)
      return false unless url.present?
      return false unless url.is_a?(String)

      uri = Addressable::URI.parse(url.strip)

      allowed_schemes.include?(uri.scheme)
    rescue Addressable::URI::InvalidURIError
      false
    end

    # Same as .valid? but restricted to ALLOWED_WEB_SCHEMES (http and https).
    def self.valid_web?(url)
      valid?(url, allowed_schemes: ALLOWED_WEB_SCHEMES)
    end

    # @param url [#to_s] the URL to parse; may embed "user:password@" userinfo
    # @param credentials [Hash, nil] optional +:user+ / +:password+ values.
    #   Non-blank values take precedence over userinfo found in +url+.
    # @raise [Addressable::URI::InvalidURIError] when +url+ cannot be parsed
    def initialize(url, credentials: nil)
      # Work on a copy: the values are normalised below and #parse_url may fill
      # in blanks from the URL. Neither step should leak into the caller's
      # hash, and a frozen hash should not raise.
      credentials = credentials&.dup

      %i[user password].each do |symbol|
        credentials[symbol] = credentials[symbol].presence if credentials&.key?(symbol)
      end

      @credentials = credentials
      @url = parse_url(url)
    end

    # @return [Hash] the credentials in effect. When none were given to the
    #   constructor or found while parsing, they are read from the parsed URL.
    def credentials
      @credentials ||= { user: @url.user.presence, password: @url.password.presence }
    end

    # @return [String, nil] the +:user+ entry of #credentials
    def user
      credentials[:user]
    end

    # @return [String] the URL with both user and password removed
    def sanitized_url
      safe_url = @url.dup
      safe_url.password = nil
      safe_url.user = nil
      reverse_schemify(safe_url.to_s)
    end
    strong_memoize_attr :sanitized_url

    # @return [String] the URL with user and password each replaced by "*****"
    #   when present
    def masked_url
      url = @url.dup
      url.password = "*****" if url.password.present?
      url.user = "*****" if url.user.present?
      reverse_schemify(url.to_s)
    end
    strong_memoize_attr :masked_url

    # @return [String] the URL with the user and password from #credentials
    #   percent-encoded into its userinfo. When #credentials holds no value at
    #   all, the parsed URL is returned as is.
    def full_url
      return reverse_schemify(@url.to_s) unless valid_credentials?

      url = @url.dup
      url.password = encode_percent(credentials[:password]) if credentials[:password].present?
      url.user = encode_percent(credentials[:user]) if credentials[:user].present?
      reverse_schemify(url.to_s)
    end
    strong_memoize_attr :full_url

    private

    # Splits any "user:password@" userinfo off +url+ before handing the rest to
    # Addressable, so that unescaped userinfo does not have to be parsed by it.
    # Extracted values only fill entries of @credentials that are blank.
    def parse_url(url)
      url = schemify(url.to_s.strip)
      match = url.match(%r{\A(?:(?:#{SCHEMIFIED_SCHEME}|git|ssh|http(?:s?)):)?//(?:(.+)(?:@))?(.+)}o)
      raw_credentials = match[1] if match

      if raw_credentials.present?
        url.sub!("#{raw_credentials}@", '')

        # Only the first ':' separates user from password.
        user, _, password = raw_credentials.partition(':')

        @credentials ||= {}
        @credentials[:user] = user.presence if @credentials[:user].blank?
        @credentials[:password] = password.presence if @credentials[:password].blank?
      end

      # `user` and `password` are nil here when the branch above was skipped.
      url = Addressable::URI.parse(url)
      url.password = password if password.present?
      url.user = user if user.present?
      url
    end

    # Modifies +url+ in place; #parse_url passes a fresh, unfrozen string.
    def schemify(url)
      # Prepend the placeholder scheme unless the URL has a scheme or is relative
      url.prepend(SCHEMIFY_PLACEHOLDER) unless url.starts_with?(%r{(?:#{URI::REGEXP::PATTERN::SCHEME}:)?//}o)
      url
    end

    # Removes the placeholder added by #schemify, if present.
    #
    # Deliberately non-destructive: callers pass the result of `#to_s` on a URI
    # object, and that string should not be edited in place.
    def reverse_schemify(url)
      url.delete_prefix(SCHEMIFY_PLACEHOLDER)
    end

    # True when #credentials is a Hash holding at least one truthy value.
    def valid_credentials?
      credentials.is_a?(Hash) && credentials.values.any?
    end

    def encode_percent(string)
      # CGI.escape converts spaces to +, but this doesn't work for git clone
      CGI.escape(string).gsub('+', '%20')
    end
  end
end