diff options
author | Douwe Maan <douwe@gitlab.com> | 2015-08-20 21:05:06 +0300 |
---|---|---|
committer | Douwe Maan <douwe@gitlab.com> | 2015-08-20 21:05:06 +0300 |
commit | e9972efc2f3d730e989907585dd1438c517a0bba (patch) | |
tree | 7a38f9638cc50813d16d55f9276db98dd7cb041c /lib/gitlab/email/reply_parser.rb | |
parent | 3ff9d5c64cef8bf8daed5e253e388545987fb945 (diff) |
Extract ReplyParser and AttachmentUploader from Receiver.
Diffstat (limited to 'lib/gitlab/email/reply_parser.rb')
-rw-r--r-- | lib/gitlab/email/reply_parser.rb | 91 |
1 files changed, 91 insertions, 0 deletions
# Inspired in great part by Discourse's Email::Receiver
module Gitlab
  module Email
    # Extracts the text a user wrote in an email reply, discarding the quoted
    # original message and any forwarded-header block that follows it.
    class ReplyParser
      # Header labels that mail clients print above a quoted/forwarded message.
      # Written as an explicit array (not %w) because "Reply To" contains a
      # space: %w would split it into "Reply" and a duplicate "To", which both
      # adds a bogus label and double-counts "To" in the same-line heuristic.
      REPLYING_HEADER_LABELS = ["From", "Sent", "To", "Subject", "Reply To", "Cc", "Bcc", "Date"].freeze

      # Matches any of the labels above immediately followed by a colon.
      REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })

      attr_accessor :message

      # message - the mail object to parse. The methods below call multipart?,
      #           html_part, text_part, content_type, charset and body on it
      #           (presumably a Mail::Message).
      def initialize(message)
        @message = message
      end

      # Returns the reply text as a UTF-8 String. Returns an empty String when
      # no usable body could be extracted from the message.
      def execute
        body = select_body(message)

        # Remember the encoding so it can be re-applied after the parsers,
        # which return new strings, have run.
        encoding = body.encoding

        body = discourse_email_trimmer(body)

        body = EmailReplyParser.parse_reply(body)

        body.force_encoding(encoding).encode("UTF-8")
      end

      private

      # Picks the body to parse: the plain-text part when there is one,
      # otherwise the cleaned HTML part, otherwise the raw message body.
      #
      # Always returns a String; "" signals that nothing usable was found.
      def select_body(message)
        html = nil
        text = nil

        if message.multipart?
          html = fix_charset(message.html_part)
          text = fix_charset(message.text_part)
        elsif message.content_type =~ /text\/html/
          html = fix_charset(message)
        end

        # prefer plain text
        return text if text

        body = html ? HtmlCleaner.new(html).output_html : fix_charset(message)

        # fix_charset yields nil when the body cannot be decoded; without this
        # guard the nil would reach #execute and raise NoMethodError there.
        return "" if body.nil?

        # Certain trigger phrases that means we didn't parse correctly
        return "" if body =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/

        body
      end

      # Force encoding to UTF-8 on a Mail::Message or Mail::Part
      #
      # Returns the decoded body as a String, or nil when object is nil or
      # when decoding raises (e.g. an unknown charset name). The rescue is a
      # deliberate best-effort: callers treat nil as "no usable body".
      def fix_charset(object)
        return nil if object.nil?

        if object.charset
          # Some clients send "utf8", which Ruby does not accept as an encoding name.
          object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s
        else
          object.body.to_s
        end
      rescue
        nil
      end

      # Cuts the body at the first line that looks like the start of a quoted
      # message and returns everything before it, stripped.
      #
      # Note that range_end starts at 0, so the first line is always kept,
      # even when it is itself the line that triggers the cut.
      def discourse_email_trimmer(body)
        lines = body.scrub.lines
        range_end = 0

        lines.each_with_index do |l, idx|
          # This one might be controversial but so many reply lines have years, times and end with a colon.
          # Let's try it and see how well it works.
          break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
                   (l =~ /On \w+ \d+,? \d+,?.*wrote:/)

          # Headers on subsequent lines
          break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX }
          # Headers on the same line
          break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3

          range_end = idx
        end

        lines[0..range_end].join.strip
      end
    end
  end
end