diff options
Diffstat (limited to 'lib/banzai')
-rw-r--r-- | lib/banzai/filter/dollar_math_post_filter.rb | 76 | ||||
-rw-r--r-- | lib/banzai/filter/dollar_math_pre_filter.rb | 49 | ||||
-rw-r--r-- | lib/banzai/filter/inline_observability_filter.rb | 16 | ||||
-rw-r--r-- | lib/banzai/filter/markdown_post_escape_filter.rb | 76 | ||||
-rw-r--r-- | lib/banzai/filter/markdown_pre_escape_filter.rb | 42 | ||||
-rw-r--r-- | lib/banzai/filter/math_filter.rb | 96 | ||||
-rw-r--r-- | lib/banzai/filter/repository_link_filter.rb | 4 | ||||
-rw-r--r-- | lib/banzai/filter/service_desk_upload_link_filter.rb | 41 | ||||
-rw-r--r-- | lib/banzai/pipeline/plain_markdown_pipeline.rb | 7 | ||||
-rw-r--r-- | lib/banzai/pipeline/service_desk_email_pipeline.rb | 11 |
10 files changed, 319 insertions, 99 deletions
diff --git a/lib/banzai/filter/dollar_math_post_filter.rb b/lib/banzai/filter/dollar_math_post_filter.rb new file mode 100644 index 00000000000..94d1b4bcb48 --- /dev/null +++ b/lib/banzai/filter/dollar_math_post_filter.rb @@ -0,0 +1,76 @@ +# frozen_string_literal: true + +# Generated HTML is transformed back to GFM by: +# - app/assets/javascripts/behaviors/markdown/marks/math.js +# - app/assets/javascripts/behaviors/markdown/nodes/code_block.js +module Banzai + module Filter + # HTML filter that implements our dollar math syntax, one of three filters: + # DollarMathPreFilter, DollarMathPostFilter, and MathFilter + # + class DollarMathPostFilter < HTML::Pipeline::Filter + # Based on the Pandoc heuristics, + # https://pandoc.org/MANUAL.html#extension-tex_math_dollars + # + # Handle the $...$ and $$...$$ inline syntax in this filter, after markdown processing + # but before post-handling of escaped characters. Any escaped $ will have been specially + # encoded and will therefore not interfere with the detection of the dollar syntax. + + # Corresponds to the "$...$" syntax + DOLLAR_INLINE_PATTERN = %r{ + (?<matched>\$(?<math>(?:\S[^$\n]*?\S|[^$\s]))\$)(?:[^\d]|$) + }x.freeze + + # Corresponds to the "$$...$$" syntax + DOLLAR_DISPLAY_INLINE_PATTERN = %r{ + (?<matched>\$\$\ *(?<math>[^$\n]+?)\ *\$\$) + }x.freeze + + # Order dependent. 
Handle the `$$` syntax before the `$` syntax + DOLLAR_MATH_PIPELINE = [ + { pattern: DOLLAR_DISPLAY_INLINE_PATTERN, style: :display }, + { pattern: DOLLAR_INLINE_PATTERN, style: :inline } + ].freeze + + # Do not recognize math inside these tags + IGNORED_ANCESTOR_TAGS = %w[pre code tt].to_set + + def call + process_dollar_pipeline + + doc + end + + def process_dollar_pipeline + doc.xpath('descendant-or-self::text()').each do |node| + next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS) + + node_html = node.to_html + next unless node_html.match?(DOLLAR_INLINE_PATTERN) || + node_html.match?(DOLLAR_DISPLAY_INLINE_PATTERN) + + temp_doc = Nokogiri::HTML.fragment(node_html) + + DOLLAR_MATH_PIPELINE.each do |pipeline| + temp_doc.xpath('child::text()').each do |temp_node| + html = temp_node.to_html + temp_node.content.scan(pipeline[:pattern]).each do |matched, math| + html.sub!(matched, math_html(math: math, style: pipeline[:style])) + end + + temp_node.replace(html) + end + end + + node.replace(temp_doc) + end + end + + private + + def math_html(math:, style:) + "<code data-math-style=\"#{style}\">#{math}</code>" + end + end + end +end diff --git a/lib/banzai/filter/dollar_math_pre_filter.rb b/lib/banzai/filter/dollar_math_pre_filter.rb new file mode 100644 index 00000000000..aaa186f87a6 --- /dev/null +++ b/lib/banzai/filter/dollar_math_pre_filter.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +# Generated HTML is transformed back to GFM by: +# - app/assets/javascripts/behaviors/markdown/marks/math.js +# - app/assets/javascripts/behaviors/markdown/nodes/code_block.js +module Banzai + module Filter + # HTML filter that implements our dollar math syntax, one of three filters: + # DollarMathPreFilter, DollarMathPostFilter, and MathFilter + # + class DollarMathPreFilter < HTML::Pipeline::TextFilter + # Based on the Pandoc heuristics, + # https://pandoc.org/MANUAL.html#extension-tex_math_dollars + # + # Handle the $$\n...\n$$ syntax in this filter, before markdown 
processing, + # by converting it into the ```math syntax. In this way, we can ensure + # that it's considered a code block and will not have any markdown processed inside it. + + # Corresponds to the "$$\n...\n$$" syntax + REGEX = %r{ + #{::Gitlab::Regex.markdown_code_or_html_blocks} + | + (?=(?<=^\n|\A)\$\$\ *\n.*\n\$\$\ *(?=\n$|\z))(?: + # Display math block: + # $$ + # latex math + # $$ + + (?<=^\n|\A)\$\$\ *\n + (?<display_math> + (?:.)+? + ) + \n\$\$\ *(?=\n$|\z) + ) + }mx.freeze + + def call + @text.gsub(REGEX) do + if $~[:display_math] + # change from $$ to ```math + "```math\n#{$~[:display_math]}\n```" + else + $~[0] + end + end + end + end + end +end diff --git a/lib/banzai/filter/inline_observability_filter.rb b/lib/banzai/filter/inline_observability_filter.rb index 27b89073a0e..334c04f2b59 100644 --- a/lib/banzai/filter/inline_observability_filter.rb +++ b/lib/banzai/filter/inline_observability_filter.rb @@ -3,6 +3,12 @@ module Banzai module Filter class InlineObservabilityFilter < ::Banzai::Filter::InlineEmbedsFilter + def call + return doc unless can_view_observability? + + super + end + # Placeholder element for the frontend to use as an # injection point for observability. def create_element(url) @@ -25,6 +31,16 @@ module Banzai create_element(url) end + + private + + def can_view_observability? 
+ Feature.enabled?(:observability_group_tab, group) + end + + def group + context[:group] || context[:project]&.group + end end end end diff --git a/lib/banzai/filter/markdown_post_escape_filter.rb b/lib/banzai/filter/markdown_post_escape_filter.rb index 09ae09a22ae..8c0bd62f80a 100644 --- a/lib/banzai/filter/markdown_post_escape_filter.rb +++ b/lib/banzai/filter/markdown_post_escape_filter.rb @@ -2,33 +2,69 @@ module Banzai module Filter + # See comments in MarkdownPreEscapeFilter for details on strategy class MarkdownPostEscapeFilter < HTML::Pipeline::Filter LITERAL_KEYWORD = MarkdownPreEscapeFilter::LITERAL_KEYWORD LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-(.*?)-#{LITERAL_KEYWORD}}.freeze NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze SPAN_REGEX = %r{<span>(.*?)</span>}.freeze - CSS_A = 'a' - XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_A).freeze - CSS_LANG_TAG = 'pre' - XPATH_LANG_TAG = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_LANG_TAG).freeze + XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze + XPATH_LANG_TAG = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze + XPATH_CODE_SPAN = Gitlab::Utils::Nokogiri.css_to_xpath('code > span').freeze def call return doc unless result[:escaped_literals] - # For any literals that actually didn't get escape processed - # (for example in code blocks), remove the special sequence. - html.gsub!(NOT_LITERAL_REGEX, '\1') + new_html = unescaped_literals(doc.to_html) + new_html = add_spans(new_html) - # Replace any left over literal sequences with `span` so that our - # reference processing is short-circuited - html.gsub!(LITERAL_REGEX, '<span>\1</span>') + @doc = parse_html(new_html) - # Since literals are converted in links, we need to remove any surrounding `span`. - # Note: this could have been done in the renderer, - # Banzai::Renderer::CommonMark::HTML. However, we eventually want to use - # the built-in compiled renderer, rather than the ruby version, for speed. 
- # So let's do this work here. + remove_spans_in_certain_attributes + remove_spans_in_code + + doc + end + + private + + # For any literals that actually didn't get escape processed + # (for example in code blocks), remove the special sequence. + def unescaped_literals(html) + html.gsub!(NOT_LITERAL_REGEX) do |match| + last_match = ::Regexp.last_match(1) + last_match_token = last_match.sub('%5C', '\\') + + escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token } + escaped_char = escaped_item ? escaped_item[:escaped] : last_match + + escaped_char = escaped_char.sub('\\', '%5C') if last_match.start_with?('%5C') + + escaped_char + end + + html + end + + # Replace any left over literal sequences with `span` so that our + # reference processing is short-circuited + def add_spans(html) + html.gsub!(LITERAL_REGEX) do |match| + last_match = ::Regexp.last_match(1) + last_match_token = "\\#{last_match}" + + escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token } + escaped_char = escaped_item ? escaped_item[:char] : ::Regexp.last_match(1) + + "<span>#{escaped_char}</span>" + end + + html + end + + # Since literals are converted in links, we need to remove any surrounding `span`. 
+ def remove_spans_in_certain_attributes doc.xpath(XPATH_A).each do |node| node.attributes['href'].value = node.attributes['href'].value.gsub(SPAN_REGEX, '\1') if node.attributes['href'] node.attributes['title'].value = node.attributes['title'].value.gsub(SPAN_REGEX, '\1') if node.attributes['title'] @@ -37,8 +73,16 @@ module Banzai doc.xpath(XPATH_LANG_TAG).each do |node| node.attributes['lang'].value = node.attributes['lang'].value.gsub(SPAN_REGEX, '\1') if node.attributes['lang'] end + end - doc + # Any `<span>` that makes it into a `<code>` element is from the math processing, + # convert back to the escaped character, such as `\$` + def remove_spans_in_code + doc.xpath(XPATH_CODE_SPAN).each do |node| + escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content && item[:latex] } + + node.replace(escaped_item[:escaped]) if escaped_item + end end end end diff --git a/lib/banzai/filter/markdown_pre_escape_filter.rb b/lib/banzai/filter/markdown_pre_escape_filter.rb index 8d54d140877..8cc7b0defd6 100644 --- a/lib/banzai/filter/markdown_pre_escape_filter.rb +++ b/lib/banzai/filter/markdown_pre_escape_filter.rb @@ -10,6 +10,10 @@ module Banzai # This way CommonMark will properly handle the backslash escaped chars # but we will maintain knowledge (the sequence) that it was a literal. # + # This processing is also important for the handling of escaped characters + # in LaTeX math. These will need to be converted back into their escaped + # versions if they are detected in math blocks. + # # We need to surround the character, not just prefix it. It could # get converted into an entity by CommonMark and we wouldn't know how many # characters there are. The entire literal needs to be surrounded with @@ -24,9 +28,36 @@ module Banzai # This filter does the initial surrounding, and MarkdownPostEscapeFilter # does the conversion into span tags. 
class MarkdownPreEscapeFilter < HTML::Pipeline::TextFilter - # We just need to target those that are special GitLab references - REFERENCE_CHARACTERS = '@#!$&~%^' - ASCII_PUNCTUATION = %r{(\\[#{REFERENCE_CHARACTERS}])}.freeze + # Table of characters that need this special handling. It consists of the + # GitLab special reference characters and special LaTeX characters. + # + # The `token` is used when we do the initial replacement - for example converting + # `\$` into `cmliteral-\+a-cmliteral`. We don't simply replace `\$` with `$`, + # because this can cause difficulties in parsing math blocks that use `$` as a + # delimiter. We also include a character that _can_ be escaped, `\+`. By examining + # the text once it's been passed to markdown, we can determine that `cmliteral-\+a-cmliteral` + # was in a block where markdown did _not_ escape the character, for example an inline + # code block or some other element. In this case, we must convert back to the + # original escaped version, `\$`. However, if we detect `cmliteral-+a-cmliteral`, + # then we know markdown considered it an escaped character, and we should replace it + # with the non-escaped version, `$`. + # See the MarkdownPostEscapeFilter for how this is done. 
+ ESCAPABLE_CHARS = [ + { char: '$', escaped: '\$', token: '\+a', reference: true, latex: true }, + { char: '%', escaped: '\%', token: '\+b', reference: true, latex: true }, + { char: '#', escaped: '\#', token: '\+c', reference: true, latex: true }, + { char: '&', escaped: '\&', token: '\+d', reference: true, latex: true }, + { char: '{', escaped: '\{', token: '\+e', reference: false, latex: true }, + { char: '}', escaped: '\}', token: '\+f', reference: false, latex: true }, + { char: '_', escaped: '\_', token: '\+g', reference: false, latex: true }, + { char: '@', escaped: '\@', token: '\+h', reference: true, latex: false }, + { char: '!', escaped: '\!', token: '\+i', reference: true, latex: false }, + { char: '~', escaped: '\~', token: '\+j', reference: true, latex: false }, + { char: '^', escaped: '\^', token: '\+k', reference: true, latex: false } + ].freeze + + TARGET_CHARS = ESCAPABLE_CHARS.pluck(:char).join.freeze + ASCII_PUNCTUATION = %r{(\\[#{TARGET_CHARS}])}.freeze LITERAL_KEYWORD = 'cmliteral' def call @@ -35,7 +66,10 @@ module Banzai # are found, we can bypass the post filter result[:escaped_literals] = true - "#{LITERAL_KEYWORD}-#{match}-#{LITERAL_KEYWORD}" + escaped_item = ESCAPABLE_CHARS.find { |item| item[:escaped] == match } + token = escaped_item ? 
escaped_item[:token] : match + + "#{LITERAL_KEYWORD}-#{token}-#{LITERAL_KEYWORD}" end end end diff --git a/lib/banzai/filter/math_filter.rb b/lib/banzai/filter/math_filter.rb index 1d854d6599b..9b6fc71077a 100644 --- a/lib/banzai/filter/math_filter.rb +++ b/lib/banzai/filter/math_filter.rb @@ -1,55 +1,29 @@ # frozen_string_literal: true -require 'uri' - # Generated HTML is transformed back to GFM by: # - app/assets/javascripts/behaviors/markdown/marks/math.js # - app/assets/javascripts/behaviors/markdown/nodes/code_block.js module Banzai module Filter - # HTML filter that implements our math syntax, adding class="code math" + # HTML filter that implements the original GitLab math syntax, one of three filters: + # DollarMathPreFilter, DollarMathPostFilter, and MathFilter # class MathFilter < HTML::Pipeline::Filter + # Handle the $`...`$ and ```math syntax in this filter. + # Also add necessary classes to any existing math blocks. + CSS_MATH = 'pre[lang="math"] > code' XPATH_MATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_MATH).freeze CSS_CODE = 'code' XPATH_CODE = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_CODE).freeze - - # These are based on the Pandoc heuristics, - # https://pandoc.org/MANUAL.html#extension-tex_math_dollars - # Note: at this time, using a dollar sign literal, `\$` inside - # a math statement does not work correctly. - # Corresponds to the "$...$" syntax - DOLLAR_INLINE_PATTERN = %r{ - (?<matched>\$(?<math>(?:\S[^$\n]*?\S|[^$\s]))\$)(?:[^\d]|$) - }x.freeze - - # Corresponds to the "$$...$$" syntax - DOLLAR_DISPLAY_INLINE_PATTERN = %r{ - (?<matched>\$\$\ *(?<math>[^$\n]+?)\ *\$\$) - }x.freeze - - # Corresponds to the $$\n...\n$$ syntax - DOLLAR_DISPLAY_BLOCK_PATTERN = %r{ - ^(?<matched>\$\$\ *\n(?<math>.*)\n\$\$\ *)$ - }mx.freeze - - # Order dependent. 
Handle the `$$` syntax before the `$` syntax - DOLLAR_MATH_PIPELINE = [ - { pattern: DOLLAR_DISPLAY_INLINE_PATTERN, tag: :code, style: :display }, - { pattern: DOLLAR_DISPLAY_BLOCK_PATTERN, tag: :pre, style: :display }, - { pattern: DOLLAR_INLINE_PATTERN, tag: :code, style: :inline } - ].freeze - - # Do not recognize math inside these tags - IGNORED_ANCESTOR_TAGS = %w[pre code tt].to_set + CSS_INLINE_CODE = 'code[data-math-style]' + XPATH_INLINE_CODE = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_INLINE_CODE).freeze # Attribute indicating inline or display math. STYLE_ATTRIBUTE = 'data-math-style' # Class used for tagging elements that should be rendered TAG_CLASS = 'js-render-math' - MATH_CLASSES = "code math #{TAG_CLASS}" DOLLAR_SIGN = '$' @@ -61,47 +35,31 @@ module Banzai def call @nodes_count = 0 - process_dollar_pipeline if Feature.enabled?(:markdown_dollar_math, group) - + process_existing process_dollar_backtick_inline process_math_codeblock doc end - def process_dollar_pipeline - doc.xpath('descendant-or-self::text()').each do |node| - next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS) - - node_html = node.to_html - next unless node_html.match?(DOLLAR_INLINE_PATTERN) || - node_html.match?(DOLLAR_DISPLAY_INLINE_PATTERN) || - node_html.match?(DOLLAR_DISPLAY_BLOCK_PATTERN) - - temp_doc = Nokogiri::HTML.fragment(node_html) - DOLLAR_MATH_PIPELINE.each do |pipeline| - temp_doc.xpath('child::text()').each do |temp_node| - html = temp_node.to_html - temp_node.content.scan(pipeline[:pattern]).each do |matched, math| - html.sub!(matched, math_html(tag: pipeline[:tag], style: pipeline[:style], math: math)) - - @nodes_count += 1 - break if @nodes_count >= RENDER_NODES_LIMIT - end + private - temp_node.replace(html) + # Add necessary classes to any existing math blocks + def process_existing + doc.xpath(XPATH_INLINE_CODE).each do |code| + break if @nodes_count >= RENDER_NODES_LIMIT - break if @nodes_count >= RENDER_NODES_LIMIT - end - end + code[:class] = MATH_CLASSES - 
node.replace(temp_doc) + @nodes_count += 1 end end # Corresponds to the "$`...`$" syntax def process_dollar_backtick_inline doc.xpath(XPATH_CODE).each do |code| + break if @nodes_count >= RENDER_NODES_LIMIT + closing = code.next opening = code.previous @@ -112,17 +70,16 @@ module Banzai closing.content.first == DOLLAR_SIGN && opening.content.last == DOLLAR_SIGN - code[:class] = MATH_CLASSES code[STYLE_ATTRIBUTE] = 'inline' + code[:class] = MATH_CLASSES closing.content = closing.content[1..] opening.content = opening.content[0..-2] @nodes_count += 1 - break if @nodes_count >= RENDER_NODES_LIMIT end end - # corresponds to the "```math...```" syntax + # Corresponds to the "```math...```" syntax def process_math_codeblock doc.xpath(XPATH_MATH).each do |node| pre_node = node.parent @@ -130,21 +87,6 @@ module Banzai pre_node[:class] = TAG_CLASS end end - - private - - def math_html(tag:, math:, style:) - case tag - when :code - "<code class=\"#{MATH_CLASSES}\" data-math-style=\"#{style}\">#{math}</code>" - when :pre - "<pre class=\"#{MATH_CLASSES}\" data-math-style=\"#{style}\"><code>#{math}</code></pre>" - end - end - - def group - context[:group] || context[:project]&.group - end end end end diff --git a/lib/banzai/filter/repository_link_filter.rb b/lib/banzai/filter/repository_link_filter.rb index 86beeae01b7..ddc3f5cf715 100644 --- a/lib/banzai/filter/repository_link_filter.rb +++ b/lib/banzai/filter/repository_link_filter.rb @@ -90,14 +90,14 @@ module Banzai end def get_uri(html_attr) - uri = URI(html_attr.value) + uri = Addressable::URI.parse(html_attr.value) uri if uri.relative? && uri.path.present? rescue URI::Error, Addressable::URI::InvalidURIError end def process_link_to_repository_attr(html_attr) - uri = URI(html_attr.value) + uri = Addressable::URI.parse(html_attr.value) if uri.relative? && uri.path.present? 
html_attr.value = rebuild_relative_uri(uri).to_s diff --git a/lib/banzai/filter/service_desk_upload_link_filter.rb b/lib/banzai/filter/service_desk_upload_link_filter.rb new file mode 100644 index 00000000000..9f26dfb8ae5 --- /dev/null +++ b/lib/banzai/filter/service_desk_upload_link_filter.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +module Banzai + module Filter + # HTML filter for service desk emails. + # Context options: + # :uploads_as_attachments + class ServiceDeskUploadLinkFilter < BaseRelativeLinkFilter + def call + return doc unless context[:uploads_as_attachments].present? + + linkable_attributes.reject! do |attr| + replace_upload_link(attr) + end + + doc + end + + protected + + def replace_upload_link(html_attr) + return unless html_attr.name == 'href' + return unless html_attr.value.start_with?('/uploads/') + + secret, filename_in_link = html_attr.value.scan(FileUploader::DYNAMIC_PATH_PATTERN).first + return unless context[:uploads_as_attachments].include?("#{secret}/#{filename_in_link}") + + parent = html_attr.parent + filename_in_text = parent.text + final_filename = if filename_in_link != filename_in_text + "#{filename_in_text} (#{filename_in_link})" + else + filename_in_text + end + + final_element = Nokogiri::HTML::DocumentFragment.parse("<strong>#{final_filename}</strong>") + parent.replace(final_element) + end + end + end +end diff --git a/lib/banzai/pipeline/plain_markdown_pipeline.rb b/lib/banzai/pipeline/plain_markdown_pipeline.rb index 1da0f72996b..205bbc2140d 100644 --- a/lib/banzai/pipeline/plain_markdown_pipeline.rb +++ b/lib/banzai/pipeline/plain_markdown_pipeline.rb @@ -3,10 +3,17 @@ module Banzai module Pipeline class PlainMarkdownPipeline < BasePipeline + # DollarMathPreFilter and DollarMathPostFilter need to be included here, + # rather than in another pipeline. However, since dollar math would most + # likely be supported as an extension in any other markdown parser we used, + # it is not out of place. 
We are considering this a part of the actual + # markdown processing def self.filters FilterArray[ Filter::MarkdownPreEscapeFilter, + Filter::DollarMathPreFilter, Filter::MarkdownFilter, + Filter::DollarMathPostFilter, Filter::MarkdownPostEscapeFilter ] end diff --git a/lib/banzai/pipeline/service_desk_email_pipeline.rb b/lib/banzai/pipeline/service_desk_email_pipeline.rb new file mode 100644 index 00000000000..cc7cd8a92b8 --- /dev/null +++ b/lib/banzai/pipeline/service_desk_email_pipeline.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module Banzai + module Pipeline + class ServiceDeskEmailPipeline < EmailPipeline + def self.filters + super.insert_before(Filter::ExternalLinkFilter, Banzai::Filter::ServiceDeskUploadLinkFilter) + end + end + end +end |