summaryrefslogtreecommitdiffstats
path: root/Bugzilla/Markdown.pm
diff options
context:
space:
mode:
Diffstat (limited to 'Bugzilla/Markdown.pm')
-rw-r--r--Bugzilla/Markdown.pm493
1 files changed, 493 insertions, 0 deletions
diff --git a/Bugzilla/Markdown.pm b/Bugzilla/Markdown.pm
new file mode 100644
index 000000000..c5a34fb6e
--- /dev/null
+++ b/Bugzilla/Markdown.pm
@@ -0,0 +1,493 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This Source Code Form is "Incompatible With Secondary Licenses", as
+# defined by the Mozilla Public License, v. 2.0.
+
+package Bugzilla::Markdown;
+
+use 5.10.1;
+use strict;
+use warnings;
+
+use Bugzilla::Constants;
+use Bugzilla::Template;
+
+use Digest::MD5 qw(md5_hex);
+
+use parent qw(Text::Markdown);
+
+# NOTE(review): nothing visible in this file loads Exporter, so this list
+# is presumably inert -- confirm before relying on it.
+@Bugzilla::Markdown::EXPORT = qw(new);
+
+# Regex to match balanced [brackets]. See Friedl's
+# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
+# Both patterns are declared before they are assigned so that each one
+# can refer back to itself through a postponed (??{ ... }) subexpression.
+our ($g_nested_brackets, $g_nested_parens);
+$g_nested_brackets = qr{
+ (?> # Atomic matching
+ [^\[\]]+ # Anything other than brackets
+ |
+ \[
+ (??{ $g_nested_brackets }) # Recursive set of nested brackets
+ \]
+ )*
+}x;
+# Same idea for balanced (parentheses).
+# Doesn't allow for whitespace, because we're using it to match URLs:
+$g_nested_parens = qr{
+ (?> # Atomic matching
+ [^()\s]+ # Anything other than parens or whitespace
+ |
+ \(
+ (??{ $g_nested_parens }) # Recursive set of nested brackets
+ \)
+ )*
+}x;
+
+# Maps each Markdown-significant character (plus '~', which this module
+# adds for strikethroughs) to the MD5 hex digest of that character.
+# The '~' entry is used further down as a placeholder that hides literal
+# tildes until _UnescapeSpecialChars() turns them back into '~'.
+our %g_escape_table;
+foreach my $char (split //, '\\`*_{}[]()>#+-.!~') {
+ $g_escape_table{$char} = md5_hex($char);
+}
+
+# Constructor.
+#
+# Works as a class or instance method; any further arguments are ignored.
+# Delegates to the parent class constructor with the tab width fixed to
+# MARKDOWN_TAB_WIDTH and the empty-element suffix fixed to '>'.
+sub new {
+    my ($invocant) = @_;
+    my $class = ref($invocant) || $invocant;
+
+    my @options = (
+        tab_width => MARKDOWN_TAB_WIDTH,
+        # Bugzilla uses HTML not XHTML
+        empty_element_suffix => '>',
+    );
+
+    return $class->SUPER::new(@options);
+}
+
+# Public entry point: renders $text for display.
+#
+# Params:  $text - the text to render; any further arguments are passed
+#                  through to the parent class's markdown().
+# Returns: the parent class's markdown() output when the 'markdown'
+#          feature is available and the current user's use_markdown
+#          setting is both enabled and set to 'on'; otherwise the output
+#          of Bugzilla::Template::quoteUrls() for the same text.
+sub markdown {
+    my ($self, $text, @extra) = @_;
+    my $user = Bugzilla->user;
+
+    # Short-circuit evaluation keeps the checks in their original order.
+    my $wants_markdown = Bugzilla->feature('markdown')
+        && $user->settings->{use_markdown}->{is_enabled}
+        && $user->setting('use_markdown') eq 'on';
+
+    return Bugzilla::Template::quoteUrls($text) unless $wants_markdown;
+
+    return $self->SUPER::markdown($text, @extra);
+}
+
+# Runs Bugzilla::Template::quoteUrls() over the text first, then hands the
+# result to the parent class's _Markdown() together with any remaining
+# arguments, and returns whatever the parent returns.
+sub _Markdown {
+    my ($self, $text, @extra) = @_;
+
+    my $linkified = Bugzilla::Template::quoteUrls($text);
+
+    return $self->SUPER::_Markdown($linkified, @extra);
+}
+
+# These are all the transformations that occur *within* block-level
+# tags like paragraphs, headers, and list items.
+#
+# Params:  $text - the span-level text to transform.
+# Returns: the transformed text.
+sub _RunSpanGamut {
+    my ($self, $text) = @_;
+
+    # Each step receives the output of the previous one, so the order of
+    # this list is significant. _DoStrikethroughs is Bugzilla's extension.
+    my @steps = qw(
+        _DoCodeSpans
+        _EscapeSpecialCharsWithinTagAttributes
+        _EscapeSpecialChars
+        _DoAnchors
+        _DoStrikethroughs
+        _DoAutoLinks
+        _EncodeAmpsAndAngles
+        _DoItalicsAndBold
+    );
+
+    for my $step (@steps) {
+        $text = $self->$step($text);
+    }
+
+    # Two or more spaces at the end of a line become a hard line break.
+    $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;
+
+    return $text;
+}
+
+# Override to check for HTML-escaped <>" chars.
+#
+# Strips link definitions from text, stores the URLs and titles in
+# hash references ($self->{_urls} and $self->{_titles}), keyed by the
+# lower-cased link id.
+#
+# Params:  $text - the text to scan.
+# Returns: the text with every recognised link definition removed.
+sub _StripLinkDefinitions {
+    my ($self, $text) = @_;
+    my $less_than_tab = $self->{tab_width} - 1;
+
+    # Link defs are in the form: ^[id]: url "optional title"
+    # The URL is expected in its already-linkified form, i.e. an <a> tag
+    # whose href and text are identical.
+    while ($text =~ s{
+        ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
+        [ \t]*
+        \n? # maybe *one* newline
+        [ \t]*
+        (?:&lt;)?<a\s+href="(.+?)">\2</a>(?:&gt;)? # url = \$2
+        [ \t]*
+        \n? # maybe one newline
+        [ \t]*
+        (?:
+        (?<=\s) # lookbehind for whitespace
+        (?:&quot;|\()
+        (.+?) # title = \$3
+        (?:&quot;|\))
+        [ \t]*
+        )? # title is optional
+        (?:\n+|\Z)
+    }{}omx) {
+        # Copy the captures into lexicals before calling anything else.
+        # Link IDs are case-insensitive.
+        my ($id, $url, $title) = (lc($1), $2, $3);
+
+        $self->{_urls}{$id} = $self->_EncodeAmpsAndAngles($url);
+
+        # Test definedness rather than truth so that a title of "0" is
+        # kept; the capture is undef only when no title was given.
+        if (defined $title) {
+            $title =~ s/"/&quot;/g;
+            $self->{_titles}{$id} = $title;
+        }
+    }
+
+    return $text;
+}
+
+# Turns <URL> autolinks into <a> tags.
+#
+# The delimiters may be literal angle brackets or their HTML-escaped
+# forms (&lt; and &gt;). Only http, https and ftp URLs are handled;
+# the email linkification of the original implementation is left out
+# because Bugzilla's quoteUrls() already does it.
+#
+# Params:  $text - the text to scan.
+# Returns: the text with each autolink replaced by an anchor whose href
+#          and link text are both the URL.
+sub _DoAutoLinks {
+    my $self = shift;
+    my $text = shift;
+
+    $text =~ s{(?:<|&lt;)((?:https?|ftp):[^'">\s]+)(?:>|&gt;)}{<a href="$1">$1</a>}gi;
+
+    return $text;
+}
+
+# The main reasons for overriding this method are
+# resolving URL conflicts with Bugzilla's quoteUrls()
+# and also changing '"' to '&quot;' in regular expressions wherever needed.
+#
+# Params:  $text - the text to process.
+# Returns: the text after three substitution passes (reference-style links,
+#          inline links, then bare [shortcut] references). In every pass the
+#          matched text is replaced by the return value of
+#          $self->_GenerateAnchor().
+sub _DoAnchors {
+#
+# Turn Markdown link shortcuts into <a> tags.
+#
+ my ($self, $text) = @_;
+
+ # We revert linkifications of non-email links and only
+ # those links whose URL and title are the same because
+ # this way we can be sure that link is generated by quoteUrls()
+ $text =~ s@<a \s+ href="(?! mailto ) (.+?)">\1</a>@$1@xmg;
+
+ #
+ # First, handle reference-style links: [link text] [id]
+ #
+ $text =~ s{
+ ( # wrap whole match in $1
+ \[
+ ($g_nested_brackets) # link text = $2
+ \]
+
+ [ ]? # one optional space
+ (?:\n[ ]*)? # one optional newline followed by spaces
+
+ \[
+ (.*?) # id = $3
+ \]
+ )
+ }{
+ my $whole_match = $1;
+ my $link_text = $2;
+ my $link_id = lc $3;
+
+ if ($link_id eq "") {
+ $link_id = lc $link_text; # for shortcut links like [this][].
+ }
+
+ $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
+
+ $self->_GenerateAnchor($whole_match, $link_text, $link_id);
+ }xsge;
+
+ #
+ # Next, inline-style links: [link text](url "optional title")
+ #
+ $text =~ s{
+ ( # wrap whole match in $1
+ \[
+ ($g_nested_brackets) # link text = $2
+ \]
+ \( # literal paren
+ [ \t]*
+ ($g_nested_parens) # href = $3
+ [ \t]*
+ ( # $4
+ (&quot;|') # quote char = $5
+ (.*?) # Title = $6
+ \5 # matching quote
+ [ \t]* # ignore any spaces/tabs between closing quote and )
+ )? # title is optional
+ \)
+ )
+ }{
+ # NOTE(review): $result is declared here but never used.
+ my $result;
+ my $whole_match = $1;
+ my $link_text = $2;
+ my $url = $3;
+ # undef when the optional title group did not take part in the match
+ my $title = $6;
+
+ # Remove Bugzilla quoteUrls() linkification
+ if ($url =~ /^a href="/ && $url =~ m|</a$|) {
+ $url =~ s/^[^>]+>//;
+ $url =~ s@</a$@@;
+ }
+
+ # Limit URL to HTTP/HTTPS links: anything that does not already start
+ # with http:// or https:// (case-insensitive) gets "http://" prepended.
+ $url = "http://$url" unless $url =~ m!^https?://!i;
+
+ # No link id for inline links; the URL and title are passed directly.
+ $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
+ }xsge;
+
+ #
+ # Last, handle reference-style shortcuts: [link text]
+ # These must come last in case you've also got [link test][1]
+ # or [link test](/foo)
+ #
+ $text =~ s{
+ ( # wrap whole match in $1
+ \[
+ ([^\[\]]+) # link text = $2; can't contain '[' or ']'
+ \]
+ )
+ }{
+ # NOTE(review): $result is declared here but never used.
+ my $result;
+ my $whole_match = $1;
+ my $link_text = $2;
+ (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
+
+ $self->_GenerateAnchor($whole_match, $link_text, $link_id);
+ }xsge;
+
+ return $text;
+}
+
+# Converts Markdown emphasis markers into <strong> and <em> tags.
+#
+# This override adds the GitHub Flavored Markdown (GFM) rule known as
+# "multiple underscores in words": identifiers such as variable names
+# often contain underscores, and parts of them must not be rendered as
+# emphasized/bold text. For every underscore-delimited candidate,
+# _has_multiple_underscores() is consulted; when it returns true the
+# matched text is put back unchanged, otherwise it is wrapped in the tag.
+# Asterisk-delimited emphasis is always converted.
+#
+# Params:  $text - the span-level text to transform.
+# Returns: the transformed text.
+sub _DoItalicsAndBold {
+    my ($self, $text) = @_;
+
+    # Handle at beginning of lines:
+    $text =~ s{ (^__ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S)) }
+              { _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>" }gsxe;
+
+    $text =~ s{ ^\*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+
+    $text =~ s{ (^_ (?=\S) (.+?) (?<=\S) _ (?!\S)) }
+              { _has_multiple_underscores($2) ? $1 : "<em>$2</em>" }gsxe;
+
+    $text =~ s{ ^\* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    # <strong> must go first:
+    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S) ) }
+              { _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>" }gsxe;
+
+    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+
+    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (?!\S) ) }
+              { _has_multiple_underscores($2) ? $1 : "<em>$2</em>" }gsxe;
+
+    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    # And now, a second pass to catch nested strong and emphasis special
+    # cases. Here the check is applied to the non-space run that follows
+    # the closing marker ($3), which is re-appended after the tag.
+    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (\S*) ) }
+              { _has_multiple_underscores($3) ? $1 : "<strong>$2</strong>$3" }gsxe;
+
+    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+
+    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (\S*) ) }
+              { _has_multiple_underscores($3) ? $1 : "<em>$2</em>$3" }gsxe;
+
+    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    return $text;
+}
+
+# Forms paragraphs through the parent class, but always with
+# 'wrap_in_p_tags' turned off: whatever options the caller supplies are
+# ignored, so no <p> tags are generated around the output.
+sub _FormParagraphs {
+    my $self = shift;
+    my ($text) = @_;
+
+    my %options = (wrap_in_p_tags => 0);
+
+    return $self->SUPER::_FormParagraphs($text, \%options);
+}
+
+# Converts ~~struck text~~ into <del>struck text</del>.
+#
+# The opening marker must sit at the very start of the text, or directly
+# after an underscore or a character that is neither a tilde nor a word
+# character. The struck text may not contain a tilde or start/end with
+# whitespace, and the closing marker may not be followed by another tilde.
+#
+# Params:  $text - the span-level text to transform.
+# Returns: the transformed text.
+sub _DoStrikethroughs {
+    my ($self, $text) = @_;
+
+    my @patterns = (
+        qr{ ^ ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx,
+        qr{ (?<=_|[^~\w]) ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx,
+    );
+
+    for my $pattern (@patterns) {
+        $text =~ s{$pattern}{<del>$1</del>}g;
+    }
+
+    return $text;
+}
+
+# The original _DoCodeSpans() uses the 's' modifier in its regex,
+# which prevents _DoCodeBlocks() from matching GFM fenced code blocks.
+# We copy the code from the original implementation and remove the
+# 's' modifier from it.
+#
+# Replaces each backtick-delimited span with <code>...</code>. Leading and
+# trailing spaces/tabs of the span are trimmed and its content is passed
+# through $self->_EncodeCode(). Without the 's' modifier '.' cannot match a
+# newline, so a span has to open and close on the same line.
+#
+# Params:  $text - the text to transform.
+# Returns: the transformed text.
+sub _DoCodeSpans {
+ my ($self, $text) = @_;
+
+ $text =~ s@
+ (?<!\\) # Character before opening ` can't be a backslash
+ (`+) # $1 = Opening run of `
+ (.+?) # $2 = The code block
+ (?<!`)
+ \1 # Matching closer
+ (?!`)
+ @
+ my $c = "$2";
+ $c =~ s/^[ \t]*//g; # leading whitespace
+ $c =~ s/[ \t]*$//g; # trailing whitespace
+ $c = $self->_EncodeCode($c);
+ "<code>$c</code>";
+ @egx;
+
+ return $text;
+}
+
+# Override to add GFM Fenced Code Blocks
+#
+# A fenced block starts with a line of three or more backticks and runs
+# up to the next such line. Its content is encoded with _EncodeCode(),
+# detabbed with _Detab(), and wrapped in <pre><code>...</code></pre>.
+# Afterwards the parent class handles the standard (indented) code blocks.
+#
+# Params:  $text - the text to transform.
+# Returns: the transformed text.
+sub _DoCodeBlocks {
+    my ($self, $text) = @_;
+
+    # Trailing whitespace on a fence line is matched with [^\S\n]*, i.e.
+    # any whitespace except a newline. A plain \s here would also match
+    # newlines, letting the opening fence swallow blank lines that belong
+    # to the start of the code block.
+    $text =~ s{
+        ^ `{3,} [^\S\n]* \n
+        ( # $1 = the entire code block
+        (?: .* \n+)+?
+        )
+        `{3,} [^\S\n]* $
+    }{
+        my $codeblock = $1;
+
+        $codeblock = $self->_EncodeCode($codeblock);
+        $codeblock = $self->_Detab($codeblock);
+        $codeblock =~ s/\n\z//; # remove the trailing newline
+
+        "\n\n<pre><code>" . $codeblock . "</code></pre>\n\n";
+    }egmx;
+
+    # And now do the standard code blocks
+    $text = $self->SUPER::_DoCodeBlocks($text);
+
+    return $text;
+}
+
+# Encodes the content of a code span or code block.
+#
+# The incoming text has already been HTML-escaped, so the escapings done
+# in Bugzilla::Util::html_quote() are reversed first; the parent class's
+# _EncodeCode() then encodes the text again, and finally every '~' is
+# replaced by its placeholder from %g_escape_table so that it cannot be
+# taken for a strikethrough marker.
+#
+# Params:  $text - the raw content of the code span/block.
+# Returns: the encoded content.
+sub _EncodeCode {
+    my ($self, $code) = @_;
+
+    # Applied strictly in this order. '&amp;' has to be the last one:
+    # html_quote() turns a literal '&gt;' into '&amp;gt;', and undoing
+    # '&amp;' first would yield '&gt;' and then, wrongly, '>'.
+    my @unescapes = (
+        [ '&lt;'   => '<' ],
+        [ '&gt;'   => '>' ],
+        [ '&quot;' => '"' ],
+        [ '&#64;'  => '@' ],
+        [ '&amp;'  => '&' ],
+    );
+
+    for my $pair (@unescapes) {
+        my ($entity, $char) = @{$pair};
+        $code =~ s/\Q$entity\E/$char/g;
+    }
+
+    $code = $self->SUPER::_EncodeCode($code);
+    $code =~ s/~/$g_escape_table{'~'}/go;
+
+    return $code;
+}
+
+# Extends the parent class's backslash-escape handling with '\~': after
+# the parent has processed the text, every backslash-escaped tilde is
+# replaced by the '~' placeholder from %g_escape_table.
+#
+# Params:  $text - the text to process.
+# Returns: the processed text.
+sub _EncodeBackslashEscapes {
+    my ($self, $input) = @_;
+
+    my $escaped = $self->SUPER::_EncodeBackslashEscapes($input);
+    $escaped =~ s/\\~/$g_escape_table{'~'}/go;
+
+    return $escaped;
+}
+
+# Extends the parent class's unescaping with the tilde: after the parent
+# has processed the text, every '~' placeholder from %g_escape_table is
+# turned back into a literal '~'.
+#
+# Params:  $text - the text to process.
+# Returns: the processed text.
+sub _UnescapeSpecialChars {
+    my ($self, $input) = @_;
+
+    my $restored = $self->SUPER::_UnescapeSpecialChars($input);
+    $restored =~ s/$g_escape_table{'~'}/~/go;
+
+    return $restored;
+}
+
+# Check if the passed string is of the form multiple_underscores_in_a_word.
+# To check that, we first need to make sure that the string does not contain
+# any white-space. Then, if the string is composed of non-space chunks which
+# are bound together with underscores, the string has the desired form.
+#
+# Params:  $string - the candidate string (may be undef).
+# Returns: 1 if the string contains no whitespace and splitting it on '_'
+#          yields more than one field, 0 otherwise (including for undef
+#          and the empty string).
+sub _has_multiple_underscores {
+    my ($string) = @_;
+
+    return 0 unless defined($string) && length($string);
+    return 0 if $string =~ /\s/;
+
+    # Split into a real array instead of calling split in scalar context:
+    # on the Perl 5.10.x releases this file still supports, a scalar-context
+    # split is an implicit (deprecated) split to @_ that clobbers it.
+    # An array is used rather than "() = split ..." because assigning to a
+    # list makes split apply an implicit LIMIT.
+    # As before, trailing empty fields are dropped, so "foo_" counts as a
+    # single chunk.
+    my @chunks = split /_/, $string;
+
+    return @chunks > 1 ? 1 : 0;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Bugzilla::Markdown - Generates HTML output from structured plain-text input.
+
+=head1 SYNOPSIS
+
+ use Bugzilla::Markdown;
+
+ my $markdown = Bugzilla::Markdown->new();
+ print $markdown->markdown($text);
+
+=head1 DESCRIPTION
+
+Bugzilla::Markdown implements a Markdown engine that produces
+an HTML-based output from a given plain-text input.
+
+Most of the implementation is provided by the C<Text::Markdown>
+CPAN module. This module also applies the linkifications done in
+L<Bugzilla::Template> to the input, so the output combines Markdown
+structures with those defined by Bugzilla itself.
+
+=head2 Methods
+
+=over
+
+=item C<markdown>
+
+C<string> Produces an HTML-based output string based on the structures
+and format defined in the given plain-text input.
+
+=over
+
+=item B<Params>
+
+=over
+
+=item C<text>
+
+C<string> A plain-text string which includes Markdown structures.
+
+=back
+
+=back
+
+=back