From ec5caa57cc14a328b8b994d49cb8def8eb95aea7 Mon Sep 17 00:00:00 2001
From: Koosha KM <koosha.khajeh@gmail.com>
Date: Thu, 28 Aug 2014 17:17:54 +0000
Subject: Bug 330707: Add optional support for MarkDown r=dkl,a=sgreen

---
 Bugzilla/Markdown.pm | 493 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 493 insertions(+)
 create mode 100644 Bugzilla/Markdown.pm

(limited to 'Bugzilla/Markdown.pm')

diff --git a/Bugzilla/Markdown.pm b/Bugzilla/Markdown.pm
new file mode 100644
index 000000000..c5a34fb6e
--- /dev/null
+++ b/Bugzilla/Markdown.pm
@@ -0,0 +1,493 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This Source Code Form is "Incompatible With Secondary Licenses", as
+# defined by the Mozilla Public License, v. 2.0.
+
+package Bugzilla::Markdown;
+
+use 5.10.1;
+use strict;
+use warnings;
+
+use Bugzilla::Constants;
+use Bugzilla::Template;
+
+use Digest::MD5 qw(md5_hex);
+
+use parent qw(Text::Markdown);
+
+@Bugzilla::Markdown::EXPORT = qw(new);  # NOTE(review): no Exporter is loaded here; confirm this is needed
+
+# Patterns matching text with balanced, arbitrarily nested [brackets] and
+# (parens); see Friedl, "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
+our ($g_nested_brackets, $g_nested_parens);
+$g_nested_brackets = qr{
+    (?>                                 # Atomic matching
+       [^\[\]]+                         # Anything other than brackets
+     |
+       \[
+         (??{ $g_nested_brackets })     # Recursive set of nested brackets
+       \]
+    )*
+}x;
+# The paren variant also rejects whitespace because it is used to match URLs:
+$g_nested_parens = qr{
+    (?>                                 # Atomic matching
+       [^()\s]+                            # Anything other than parens or whitespace
+     |
+       \(
+         (??{ $g_nested_parens })        # Recursive set of nested brackets
+       \)
+    )*
+}x;
+
+# Map every Markdown metacharacter, plus '~' for Bugzilla's strikethroughs,
+# to its MD5 hex digest.
+our %g_escape_table =
+    map { ($_ => md5_hex($_)) } split //, '\\`*_{}[]()>#+-.!~';
+
+sub new {
+    my ($invocant) = @_;
+    # Usable on a class name or on an existing object.
+    my @options = (tab_width            => MARKDOWN_TAB_WIDTH,
+                   empty_element_suffix => '>');    # Bugzilla uses HTML not XHTML
+    return (ref($invocant) || $invocant)->SUPER::new(@options);
+}
+
+sub markdown {
+    my $self = shift;
+    my $text = shift;
+    my $user = Bugzilla->user;
+
+    # Render as Markdown only when the feature is available, the
+    # 'use_markdown' setting is enabled and this user has switched it on.
+    my $wants_markdown = Bugzilla->feature('markdown')
+        && $user->settings->{use_markdown}->{is_enabled}
+        && $user->setting('use_markdown') eq 'on';
+    return $self->SUPER::markdown($text, @_) if $wants_markdown;
+
+    return Bugzilla::Template::quoteUrls($text);
+}
+
+sub _Markdown {
+    my $self = shift;
+    my $text = shift;
+    # Bugzilla's own linkification runs first; the parent implementation
+    # then processes the result together with any remaining arguments.
+    my $linkified = Bugzilla::Template::quoteUrls($text);
+    return $self->SUPER::_Markdown($linkified, @_);
+}
+
+sub _RunSpanGamut {
+    # Span-level transformations, i.e. everything that happens *within*
+    # block-level tags such as paragraphs, headers, and list items.
+    my ($self, $text) = @_;
+    # Applied in this order; _DoStrikethroughs is Bugzilla's extension.
+    my @steps = qw(
+        _DoCodeSpans
+        _EscapeSpecialCharsWithinTagAttributes
+        _EscapeSpecialChars
+        _DoAnchors
+        _DoStrikethroughs
+        _DoAutoLinks
+        _EncodeAmpsAndAngles
+        _DoItalicsAndBold
+    );
+    for my $step (@steps) {
+        $text = $self->$step($text);
+    }
+
+    # Two or more spaces before a newline become an explicit line break.
+    $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;
+    return $text;
+}
+
+# Override to check for HTML-escaped <>" chars. Returns the stripped text.
+sub _StripLinkDefinitions {
+#
+# Strips link definitions from $text and stores each URL and optional title
+# in $self->{_urls} / $self->{_titles}, keyed by the lower-cased link id.
+# The URL must appear as an anchor whose text equals its href.
+    my ($self, $text) = @_;
+    my $less_than_tab = $self->{tab_width} - 1;  # max. leading spaces before [id]:
+
+    # Link defs are in the form: ^[id]: url "optional title"
+    while ($text =~ s{
+            ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
+              [ \t]*
+              \n?               # maybe *one* newline
+              [ \t]*
+            (?:&lt;)?<a\s+href="(.+?)">\2</a>(?:&gt;)?          # url = \$2
+              [ \t]*
+              \n?               # maybe one newline
+              [ \t]*
+            (?:
+                (?<=\s)         # lookbehind for whitespace
+                (?:&quot;|\()
+                (.+?)           # title = \$3
+                (?:&quot;|\))
+                [ \t]*
+            )?  # title is optional
+            (?:\n+|\Z)
+        }{}omx) {  # /o: the pattern, including $less_than_tab, is compiled only once
+        $self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
+        if ($3) {
+            $self->{_titles}{lc $1} = $3;
+            $self->{_titles}{lc $1} =~ s/"/&quot;/g;
+        }
+
+    }
+
+    return $text;
+}
+
+# Turn <URL> into a link. The angle brackets may arrive HTML-escaped as &lt;
+# and &gt;. E-mail autolinks are not handled here because Bugzilla's
+# quoteUrls() already linkifies them. Returns the transformed text.
+sub _DoAutoLinks {
+    my ($self, $text) = @_;
+    # Non-greedy, so the URL ends at the *first* closing '>' or '&gt;'.
+    $text =~ s{(?:<|&lt;)((?:https?|ftp):[^'">\s]+?)(?:>|&gt;)}{<a href="$1">$1</a>}gi;
+    return $text;
+}
+
+# Overridden mainly to resolve conflicts between Markdown links and the URLs
+# already linkified by Bugzilla's quoteUrls(), and to look for '&quot;'
+# instead of '"' in the regular expressions wherever needed.
+sub _DoAnchors {
+#
+# Turn Markdown link shortcuts into <a> tags; returns the transformed text.
+#
+    my ($self, $text) = @_;
+
+    # We revert linkifications of non-email links and only
+    # those links whose URL and title are the same because
+    # this way we can be sure that link is generated by quoteUrls()
+    $text =~ s@<a \s+ href="(?! mailto ) (.+?)">\1</a>@$1@xmg;
+
+    #
+    # First, handle reference-style links: [link text] [id]
+    #
+    $text =~ s{
+        (                   # wrap whole match in $1
+          \[
+            ($g_nested_brackets)    # link text = $2
+          \]
+
+          [ ]?              # one optional space
+          (?:\n[ ]*)?       # one optional newline followed by spaces
+
+          \[
+            (.*?)       # id = $3
+          \]
+        )
+    }{
+        my $whole_match = $1;
+        my $link_text   = $2;
+        my $link_id     = lc $3;
+
+        if ($link_id eq "") {
+            $link_id = lc $link_text;   # for shortcut links like [this][].
+        }
+
+        $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
+
+        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
+    }xsge;
+
+    #
+    # Next, inline-style links: [link text](url "optional title")
+    #
+    $text =~ s{
+        (               # wrap whole match in $1
+          \[
+            ($g_nested_brackets)    # link text = $2
+          \]
+          \(            # literal paren
+            [ \t]*
+            ($g_nested_parens)   # href = $3
+            [ \t]*
+            (           # $4
+              (&quot;|')    # quote char = $5
+              (.*?)     # Title = $6
+              \5        # matching quote
+              [ \t]*    # ignore any spaces/tabs between closing quote and )
+            )?          # title is optional
+          \)
+        )
+    }{
+        my $result;  # NOTE(review): never used in this block
+        my $whole_match = $1;
+        my $link_text   = $2;
+        my $url         = $3;
+        my $title       = $6;
+
+        # Remove Bugzilla quoteUrls() linkification
+        if ($url =~ /^a href="/ && $url =~ m|</a$|) {
+            $url =~ s/^[^>]+>//;
+            $url =~ s@</a$@@;
+        }  # NOTE(review): $url has no whitespace, so this looks unreachable - confirm
+
+        # Limit URL to HTTP/HTTPS links: anything else gets 'http://' prepended
+        $url = "http://$url" unless $url =~ m!^https?://!i;
+
+        $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
+    }xsge;
+
+    #
+    # Last, handle reference-style shortcuts: [link text]
+    # These must come last in case you've also got [link test][1]
+    # or [link test](/foo)
+    #
+    $text =~ s{
+        (                    # wrap whole match in $1
+          \[
+            ([^\[\]]+)        # link text = $2; can't contain '[' or ']'
+          \]
+        )
+    }{
+        my $result;  # NOTE(review): never used in this block
+        my $whole_match = $1;
+        my $link_text   = $2;
+        (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
+
+        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
+    }xsge;
+
+    return $text;
+}
+
+# Overridden to support the GitHub Flavored Markdown (GFM) feature called
+# 'Multiple underscores in words'. Standard Markdown uses underscores to
+# mark emphasised/bold text, but identifiers in programming languages often
+# contain underscores and must not end up partly emphasised/bold. So for the
+# underscore forms, whenever _has_multiple_underscores() accepts the
+# relevant capture, the matched text is put back exactly as it was.
+# The asterisk forms are always converted.
+sub _DoItalicsAndBold {
+    my ($self, $text) = @_;
+
+    # Markers opening at the very start of the text ('^' without /m):
+    $text =~ s{ (^__ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S)) }
+              {
+                  my ($whole, $inner) = ($1, $2);
+                  _has_multiple_underscores($inner) ? $whole : "<strong>$inner</strong>";
+              }gsxe;
+
+    $text =~ s{ ^\*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+
+    $text =~ s{ (^_ (?=\S) (.+?) (?<=\S) _ (?!\S)) }
+              {
+                  my ($whole, $inner) = ($1, $2);
+                  _has_multiple_underscores($inner) ? $whole : "<em>$inner</em>";
+              }gsxe;
+
+    $text =~ s{ ^\* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    # Markers following a non-word character; <strong> must go first:
+    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S) ) }
+              {
+                  my ($whole, $inner) = ($1, $2);
+                  _has_multiple_underscores($inner) ? $whole : "<strong>$inner</strong>";
+              }gsxe;
+
+    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+
+    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (?!\S) ) }
+              {
+                  my ($whole, $inner) = ($1, $2);
+                  _has_multiple_underscores($inner) ? $whole : "<em>$inner</em>";
+              }gsxe;
+
+    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    # Second pass for nested strong/emphasis special cases: text directly
+    # after the closing marker is the tail, and the tail is what is checked.
+    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (\S*) ) }
+              {
+                  my ($whole, $inner, $tail) = ($1, $2, $3);
+                  _has_multiple_underscores($tail) ? $whole : "<strong>$inner</strong>$tail";
+              }gsxe;
+
+    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
+    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (\S*) ) }
+              {
+                  my ($whole, $inner, $tail) = ($1, $2, $3);
+                  _has_multiple_underscores($tail) ? $whole : "<em>$inner</em>$tail";
+              }gsxe;
+
+    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
+
+    return $text;
+}
+
+# Override that ignores the caller's options and never wraps output in <p> tags.
+sub _FormParagraphs {
+    my ($self, $text) = @_;
+    my $no_p_tags = { wrap_in_p_tags => 0 };
+    return $self->SUPER::_FormParagraphs($text, $no_p_tags);
+}
+
+# Bugzilla extension: render ~~text~~ as <del>text</del>.
+sub _DoStrikethroughs {
+    my ($self, $text) = @_;
+    my $at_start = qr{ ^ ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx;
+    my $in_text  = qr{ (?<=_|[^~\w]) ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx;
+    $text =~ s{$_}{<del>$1</del>}g for $at_start, $in_text;
+    return $text;
+}
+
+# Text::Markdown's own _DoCodeSpans() applies the 's' modifier to its regex,
+# which prevents _DoCodeBlocks() from matching GFM fenced code blocks.
+# This is a copy of that implementation with the 's' modifier removed, so
+# a code span cannot extend across a newline.
+sub _DoCodeSpans {
+    my ($self, $text) = @_;
+
+    $text =~ s@
+            (?<!\\)     # Character before opening ` can't be a backslash
+            (`+)        # $1 = Opening run of `
+            (.+?)       # $2 = The code block
+            (?<!`)
+            \1          # Matching closer
+            (?!`)
+        @
+             my $c = "$2";  # copy the capture before the inner s/// calls run
+             $c =~ s/^[ \t]*//g; # leading whitespace
+             $c =~ s/[ \t]*$//g; # trailing whitespace
+             $c = $self->_EncodeCode($c);
+            "<code>$c</code>";
+        @egx;
+
+    return $text;
+}
+
+# Override to add GFM fenced code blocks: text between two lines of three or
+# more backticks becomes <pre><code>...</code></pre>. The parent then handles
+# the standard code blocks. Returns the transformed text.
+sub _DoCodeBlocks {
+    my ($self, $text) = @_;
+
+    $text =~ s{
+        ^ `{3,} [ \t]* \n   # opening fence: only blanks may follow it, so
+                            # that empty lines starting the code are kept
+        (                # $1 = the entire code block
+          (?: .* \n+)+?
+        )
+        `{3,} [\s\t]* $
+        }{
+            my $codeblock = $1;
+            $codeblock = $self->_EncodeCode($codeblock);
+            $codeblock = $self->_Detab($codeblock);
+            $codeblock =~ s/\n\z//; # remove the trailing newline
+
+            "\n\n<pre><code>" . $codeblock . "</code></pre>\n\n";
+        }egmx;
+
+    # And now do the standard code blocks
+    $text = $self->SUPER::_DoCodeBlocks($text);
+
+    return $text;
+}
+
+sub _EncodeCode {
+    my ($self, $text) = @_;
+
+    # Inside code, reverse the escaping done by Bugzilla::Util::html_quote().
+    # '&amp;' must come last: html_quote() turns a literal '&gt;' into
+    # '&amp;gt;', and decoding '&amp;' first would let it end up as '>'.
+    my @entities = ('&lt;' => '<', '&gt;' => '>', '&quot;' => '"',
+                    '&#64;' => '@', '&amp;' => '&');
+    while (my ($entity, $char) = splice(@entities, 0, 2)) {
+        $text =~ s/\Q$entity\E/$char/g;
+    }
+
+    $text = $self->SUPER::_EncodeCode($text);
+
+    # Replace every '~' with its placeholder from %g_escape_table.
+    $text =~ s/~/$g_escape_table{'~'}/go;
+
+    return $text;
+}
+
+# Extends the parent so that a backslash-escaped tilde ('\~') is replaced
+# by the placeholder for '~' from %g_escape_table.
+sub _EncodeBackslashEscapes {
+    my ($self, $text) = @_;
+    my $encoded = $self->SUPER::_EncodeBackslashEscapes($text);
+    $encoded =~ s/\\~/$g_escape_table{'~'}/go;
+    return $encoded;
+}
+
+# Extends the parent so that the placeholder for '~' from %g_escape_table
+# is turned back into a literal tilde.
+sub _UnescapeSpecialChars {
+    my ($self, $text) = @_;
+    my $restored = $self->SUPER::_UnescapeSpecialChars($text);
+    $restored =~ s/$g_escape_table{'~'}/~/go;
+    return $restored;
+}
+
+# Returns 1 for strings of the form multiple_underscores_in_a_word: no
+# whitespace, and more than one chunk when split on '_'. Returns 0 otherwise.
+sub _has_multiple_underscores {
+    my $string = shift;
+    return 0 unless defined($string) && length($string);
+    return 0 if $string =~ /\s/;
+    # Split into an array: split in scalar context writes to @_ and is
+    # deprecated on Perl 5.10, the minimum version this module requires.
+    my @chunks = split /_/, $string;
+    return @chunks > 1 ? 1 : 0;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Bugzilla::Markdown - Generates HTML output from structured plain-text input.
+
+=head1 SYNOPSIS
+
+ use Bugzilla::Markdown;
+
+ my $markdown = Bugzilla::Markdown->new();
+ print $markdown->markdown($text);
+
+=head1 DESCRIPTION
+
+Bugzilla::Markdown implements a Markdown engine that produces
+an HTML-based output from a given plain-text input.
+
+The majority of the implementation is done by the C<Text::Markdown>
+CPAN module. It also applies the linkifications done in L<Bugzilla::Template>
+to the input, resulting in an output which is a combination of both Markdown
+structures and those defined by Bugzilla itself.
+
+=head2 Accessors
+
+=over
+
+=item C<markdown>
+
+C<string> Produces an HTML-based output string based on the structures
+and format defined in the given plain-text input.
+
+=over
+
+=item B<Params>
+
+=over
+
+=item C<text>
+
+C<string> A plain-text string which includes Markdown structures.
+
+=back
+
+=back
+
+=back
-- 
cgit v1.2.3-24-g4f1b