From ec5caa57cc14a328b8b994d49cb8def8eb95aea7 Mon Sep 17 00:00:00 2001 From: Koosha KM Date: Thu, 28 Aug 2014 17:17:54 +0000 Subject: Bug 330707: Add optional support for MarkDown r=dkl,a=sgreen --- Bugzilla/Markdown.pm | 493 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 493 insertions(+) create mode 100644 Bugzilla/Markdown.pm (limited to 'Bugzilla/Markdown.pm') diff --git a/Bugzilla/Markdown.pm b/Bugzilla/Markdown.pm new file mode 100644 index 000000000..c5a34fb6e --- /dev/null +++ b/Bugzilla/Markdown.pm @@ -0,0 +1,493 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This Source Code Form is "Incompatible With Secondary Licenses", as +# defined by the Mozilla Public License, v. 2.0. + +package Bugzilla::Markdown; + +use 5.10.1; +use strict; +use warnings; + +use Bugzilla::Constants; +use Bugzilla::Template; + +use Digest::MD5 qw(md5_hex); + +use parent qw(Text::Markdown); + +@Bugzilla::Markdown::EXPORT = qw(new); + +# Regex to match balanced [brackets]. See Friedl's +# "Mastering Regular Expressions", 2nd Ed., pp. 328-331. 
# Recursive patterns used when matching link text and link URLs. They are
# package variables ("our") so that the deferred (??{ ... }) sub-expressions
# can refer back to the pattern that contains them.
our ($g_nested_brackets, $g_nested_parens);
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
       [^\[\]]+                         # Anything other than brackets
     |
       \[
         (??{ $g_nested_brackets })     # Recursive set of nested brackets
       \]
    )*
}x;
# Doesn't allow for whitespace, because we're using it to match URLs:
$g_nested_parens = qr{
    (?>                                 # Atomic matching
       [^()\s]+                         # Anything other than parens or whitespace
     |
       \(
         (??{ $g_nested_parens })       # Recursive set of nested parens
       \)
    )*
}x;

# Maps every character that has a special meaning in Markdown (plus '~',
# which this module uses for strikethroughs) to the MD5 hex digest of that
# character. The digest stands in for the character while the text is being
# processed and is swapped back in _UnescapeSpecialChars().
our %g_escape_table;
foreach my $char (split //, '\\`*_{}[]()>#+-.!~') {
    $g_escape_table{$char} = md5_hex($char);
}

# Constructor. Accepts no options of its own: the parent constructor is
# always called with Bugzilla's tab width and an HTML-style (not XHTML)
# empty element suffix.
sub new {
    my $invocant = shift;
    my $class = ref $invocant || $invocant;
    return $class->SUPER::new(tab_width => MARKDOWN_TAB_WIDTH,
                              # Bugzilla uses HTML not XHTML
                              empty_element_suffix => '>');
}

# Renders $text for display.
#
# Params:  $text - the text to render; any further arguments are passed
#                  through to the parent's markdown() unchanged.
# Returns: the parent's Markdown rendering when the 'markdown' feature is
#          available, the 'use_markdown' user setting is enabled and the
#          current user has it switched on; otherwise the result of
#          Bugzilla::Template::quoteUrls() on $text.
sub markdown {
    my $self = shift;
    my $text = shift;
    my $user = Bugzilla->user;

    if (Bugzilla->feature('markdown')
        && $user->settings->{use_markdown}->{is_enabled}
        && $user->setting('use_markdown') eq 'on')
    {
        return $self->SUPER::markdown($text, @_);
    }

    return Bugzilla::Template::quoteUrls($text);
}

# Runs Bugzilla's own linkification over the text first and then hands the
# result to the parent implementation.
sub _Markdown {
    my $self = shift;
    my $text = shift;

    $text = Bugzilla::Template::quoteUrls($text);

    return $self->SUPER::_Markdown($text, @_);
}

sub _RunSpanGamut {
    # These are all the transformations that occur *within* block-level
    # tags like paragraphs, headers, and list items.

    my ($self, $text) = @_;

    $text = $self->_DoCodeSpans($text);
    $text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
    $text = $self->_EscapeSpecialChars($text);

    $text = $self->_DoAnchors($text);

    # Strikethroughs is Bugzilla's extension
    $text = $self->_DoStrikethroughs($text);

    $text = $self->_DoAutoLinks($text);
    $text = $self->_EncodeAmpsAndAngles($text);
    $text = $self->_DoItalicsAndBold($text);

    # Two or more trailing spaces before a newline mean a hard line break.
    # The break tag is closed with the configured empty element suffix.
    $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;

    return $text;
}

# Override to check for HTML-escaped <>" chars.
sub _StripLinkDefinitions {
#
# Strips link definitions from text, stores the URLs and titles in
# hash references.
#
# The text has already been HTML-escaped at this point, so the optional
# angle brackets around the URL and the quotes around the title are
# matched in their entity form.
#
    my ($self, $text) = @_;
    my $less_than_tab = $self->{tab_width} - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    while ($text =~ s{
            ^[ ]{0,$less_than_tab}\[(.+)\]: # id = group 1
              [ \t]*
              \n?                           # maybe *one* newline
              [ \t]*
            (?:&lt;)?(\S+?)(?:&gt;)?        # url = group 2
              [ \t]*
              \n?                           # maybe one newline
              [ \t]*
            (?:
                (?<=\s)                     # lookbehind for whitespace
                (?:&quot;|\()
                (.+?)                       # title = group 3
                (?:&quot;|\))
                [ \t]*
            )?                              # title is optional
            (?:\n+|\Z)
        }{}omx) {
        # Copy the captures before calling anything else.
        my ($link_id, $url, $title) = (lc $1, $2, $3);

        # Link IDs are case-insensitive
        $self->{_urls}{$link_id} = $self->_EncodeAmpsAndAngles($url);
        if ($title) {
            $self->{_titles}{$link_id} = $title;
            $self->{_titles}{$link_id} =~ s/"/&quot;/g;
        }
    }

    return $text;
}

# We need to look for HTML-escaped '<' and '>' (i.e. &lt; and &gt;).
# We also remove Email linkification from the original implementation
# as it is already done in Bugzilla's quoteUrls().
sub _DoAutoLinks {
    my ($self, $text) = @_;

    $text =~ s{(?:<|&lt;)((?:https?|ftp):[^'">\s]+)(?:>|&gt;)}{<a href="$1">$1</a>}gi;
    return $text;
}

# The main reasons for overriding this method are
# resolving URL conflicts with Bugzilla's quoteUrls()
# and also changing '"' to '&quot;' in regular expressions wherever needed.
sub _DoAnchors {
#
# Turn Markdown link shortcuts into <a> tags.
#
    my ($self, $text) = @_;

    # We revert linkifications of non-email links and only
    # those links whose URL and title are the same because
    # this way we can be sure that link is generated by quoteUrls()
    # NOTE(review): the exact spelling of this pattern is reconstructed
    # from the comment above - confirm against the upstream file.
    $text =~ s@<a \s+ href="(?! mailto ) (.+?)">\1</a>@$1@xmg;

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text =~ s{
        (                   # wrap whole match in group 1
          \[
            ($g_nested_brackets)    # link text = group 2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)           # id = group 3
          \]
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        my $link_id     = lc $3;

        if ($link_id eq "") {
            $link_id = lc $link_text;   # for shortcut links like [this][].
        }

        $link_id =~ s{[ ]*\n}{ }g;      # turn embedded newlines into spaces

        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
    }xsge;

    #
    # Next, inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (                   # wrap whole match in group 1
          \[
            ($g_nested_brackets)    # link text = group 2
          \]
          \(                # literal paren
            [ \t]*
            ($g_nested_parens)      # href = group 3
            [ \t]*
            (               # group 4
              (&quot;|')    # quote char = group 5
              (.*?)         # Title = group 6
              \5            # matching quote
              [ \t]*        # ignore any spaces/tabs between closing quote and )
            )?              # title is optional
          \)
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        my $url         = $3;
        my $title       = $6;

        # Remove Bugzilla quoteUrls() linkification
        # NOTE(review): this branch is reconstructed - confirm against
        # the upstream file.
        if ($url =~ /^a href="/ && $url =~ m{</a\z}) {
            $url =~ s{^[^>]+>}{};
            $url =~ s{</a\z}{};
        }

        $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
    }xsge;

    #
    # Last, handle reference-style shortcuts: [link text]
    # These must come last in case you've also got [link test][1]
    # or [link test](/foo)
    #
    $text =~ s{
        (                   # wrap whole match in group 1
          \[
            ([^\[\]]+)      # link text = group 2; can't contain '[' or ']'
          \]
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        # lower-case and turn embedded newlines into spaces
        (my $link_id = lc $2) =~ s{[ ]*\n}{ }g;

        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
    }xsge;

    return $text;
}

# The purpose of overriding this function is to add support
# for a Github Flavored Markdown (GFM) feature called 'Multiple
# underscores in words'. The standard markdown specification
# specifies the underscore for making the text emphasized/bold.
# However, some variable names in programming languages contain underscores
# and we do not want a part of those variables to look emphasized/bold.
# Instead, we render them as the way they originally are.
#
# Params:  $text - span-level text to process.
# Returns: $text with Markdown emphasis turned into <em>/<strong> tags.
#          An underscore-delimited match is left as it was found when
#          _has_multiple_underscores() reports that the inspected part is a
#          single word held together by underscores. Asterisk-delimited
#          emphasis is always converted.
sub _DoItalicsAndBold {
    my ($self, $text) = @_;

    # Handle at beginning of lines:
    $text =~ s{ (^__ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S)) }
    {
        my $result = _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>";
        $result;
    }gsxe;

    $text =~ s{ ^\*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ (^_ (?=\S) (.+?) (?<=\S) _ (?!\S)) }
    {
        my $result = _has_multiple_underscores($2) ? $1 : "<em>$2</em>";
        $result;
    }gsxe;

    $text =~ s{ ^\* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    # <strong> must go first:
    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S) ) }
    {
        my $result = _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>";
        $result;
    }gsxe;

    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (?!\S) ) }
    {
        my $result = _has_multiple_underscores($2) ? $1 : "<em>$2</em>";
        $result;
    }gsxe;

    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    # And now, a second pass to catch nested strong and emphasis special cases.
    # Here the closing delimiter may be followed directly by more non-space
    # text (group 3); that trailing text is what gets inspected.
    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (\S*) ) }
    {
        my $result = _has_multiple_underscores($3) ? $1 : "<strong>$2</strong>$3";
        $result;
    }gsxe;

    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (\S*) ) }
    {
        my $result = _has_multiple_underscores($3) ? $1 : "<em>$2</em>$3";
        $result;
    }gsxe;

    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    return $text;
}

# Override this function to ignore 'wrap_in_p_tags' from
# the caller and to not generate

# <p> tags around the output.
sub _FormParagraphs {
    my ($self, $text) = @_;
    return $self->SUPER::_FormParagraphs($text, { wrap_in_p_tags => 0 });
}

# Bugzilla's extension: text wrapped in a pair of double tildes (~~text~~)
# is rendered inside a <del> element. The wrapped text may not contain a
# tilde and may not begin or end with whitespace.
sub _DoStrikethroughs {
    my ($self, $text) = @_;

    # At the very beginning of the text ...
    $text =~ s{ ^ ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }{<del>$1</del>}gsx;
    # ... and anywhere after an underscore or a character that is neither
    # a tilde nor a word character.
    $text =~ s{ (?<=_|[^~\w]) ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }{<del>$1</del>}gsx;

    return $text;
}

# The original _DoCodeSpans() uses the 's' modifier in its regex
# which prevents _DoCodeBlocks() to match GFM fenced code blocks.
# We copy the code from the original implementation and remove the
# 's' modifier from it.
sub _DoCodeSpans {
    my ($self, $text) = @_;

    $text =~ s@
            (?<!\\)     # Character before opening ` can't be a backslash
            (`+)        # group 1 = Opening run of `
            (.+?)       # group 2 = The code span
            (?<!`)
            \1          # Matching closer
            (?!`)
        @
            my $c = "$2";
            $c =~ s/^[ \t]*//g; # leading whitespace
            $c =~ s/[ \t]*$//g; # trailing whitespace
            $c = $self->_EncodeCode($c);
            "<code>$c</code>";
        @egx;

    return $text;
}

# Override to add GFM Fenced Code Blocks
sub _DoCodeBlocks {
    my ($self, $text) = @_;

    # A fenced block starts and ends with a line made of three or more
    # backticks (optionally followed by whitespace).
    $text =~ s{
        ^ `{3,} [\s\t]* \n
        (                   # group 1 = the entire code block
          (?: .* \n+)+?
        )
        `{3,} [\s\t]* $
    }{
        my $codeblock = $1;
        my $result;

        $codeblock = $self->_EncodeCode($codeblock);
        $codeblock = $self->_Detab($codeblock);
        $codeblock =~ s/\n\z//; # remove the trailing newline

        $result = "\n\n<pre><code>" . $codeblock . "</code></pre>\n\n";
        $result;
    }egmx;

    # And now do the standard code blocks
    $text = $self->SUPER::_DoCodeBlocks($text);

    return $text;
}

sub _EncodeCode {
    my ($self, $text) = @_;

    # We need to unescape the escaped HTML characters in code blocks.
    # These are the reverse of the escapings done in Bugzilla::Util::html_quote()
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/"/g;
    $text =~ s/&#64;/\@/g;
    # '&amp;' substitution must be the last one, otherwise a literal like '&gt;'
    # will turn to '>' because '&' is already changed to '&amp;' in Bugzilla::Util::html_quote().
    # In other words, html_quote() will change '&gt;' to '&amp;gt;' and then we will
    # change '&amp;gt' -> '&gt;' -> '>' if we write this substitution as the first one.
    $text =~ s/&amp;/&/g;
    $text = $self->SUPER::_EncodeCode($text);
    # Hide tildes inside code from _DoStrikethroughs().
    $text =~ s/~/$g_escape_table{'~'}/go;

    return $text;
}

# Extends the parent's backslash-escape handling so that a backslash-escaped
# tilde is replaced by the escape-table placeholder for '~'.
sub _EncodeBackslashEscapes {
    my ($self, $text) = @_;

    $text = $self->SUPER::_EncodeBackslashEscapes($text);
    $text =~ s/\\~/$g_escape_table{'~'}/go;

    return $text;
}

# Extends the parent's unescaping so that the placeholder for '~' is turned
# back into a literal tilde.
sub _UnescapeSpecialChars {
    my ($self, $text) = @_;

    $text = $self->SUPER::_UnescapeSpecialChars($text);
    $text =~ s/$g_escape_table{'~'}/~/go;

    return $text;
}

# Check if the passed string is of the form multiple_underscores_in_a_word.
# To check that, we first need to make sure that the string does not contain
# any white-space. Then, if the string is composed of non-space chunks which
# are bound together with underscores, the string has the desired form.
#
# Params:  $string - the text to inspect (may be undef).
# Returns: 1 when splitting $string on underscores yields more than one
#          chunk, 0 otherwise (including undef, empty or whitespace-containing
#          input). Trailing empty chunks do not count, so "foo_" yields 0.
sub _has_multiple_underscores {
    my $string = shift;
    return 0 unless defined($string) && length($string);
    return 0 if $string =~ /\s/;

    # Split into a lexical array instead of calling split in scalar
    # context, which is deprecated (implicit split to @_) on the oldest
    # perl this file supports.
    my @chunks = split /_/, $string;
    return @chunks > 1 ? 1 : 0;
}

1;

__END__

=head1 NAME

Bugzilla::Markdown - Generates HTML output from structured plain-text input.

=head1 SYNOPSIS

 use Bugzilla::Markdown;

 my $markdown = Bugzilla::Markdown->new();
 print $markdown->markdown($text);

=head1 DESCRIPTION

Bugzilla::Markdown implements a Markdown engine that produces
an HTML-based output from a given plain-text input.

The majority of the implementation is done by the C<Text::Markdown>
CPAN module. It also applies the linkifications done in L<Bugzilla::Template>
to the input, resulting in an output which is a combination of both Markdown
structures and those defined by Bugzilla itself.

=head2 Accessors

=over

=item C<markdown>

C<string> Produces an HTML-based output string based on the structures
and format defined in the given plain-text input.

=over

=item B<Params>

=over

=item C<text>

C<string> A plain-text string which includes Markdown structures.

=back

=back

=back

-- cgit v1.2.3-24-g4f1b