Bug 785309 - Profanivore is throwing "ascii "\xB4" does not map to Unicode" errors

r=glob
author: Dave Lawrence <dlawrence@mozilla.com> 2012-08-28 17:00:18 +0200
committer: Dave Lawrence <dlawrence@mozilla.com> 2012-08-28 17:00:18 +0200
commit: cca07e40f68720087992ec8b5337bca91a942d91 (patch)
tree: c06120a410eb111e9856e111d0c0ad5dfec0dc0c /extensions
parent: 2def70a01bd470f46fa8b83a71e1ec59b7089093 (diff)
download: bugzilla-cca07e40f68720087992ec8b5337bca91a942d91.tar.gz
bugzilla-cca07e40f68720087992ec8b5337bca91a942d91.tar.xz
2 files changed, 83 insertions, 8 deletions
diff --git a/extensions/Profanivore/Config.pm b/extensions/Profanivore/Config.pm
index 778301fbb..354325c58 100644
--- a/extensions/Profanivore/Config.pm
+++ b/extensions/Profanivore/Config.pm
@@ -29,7 +29,12 @@ use constant REQUIRED_MODULES => [
         package => 'Regexp-Common',
         module  => 'Regexp::Common',
         version => 0
+    },
+    {
+        package => 'HTML-Tree',
+        module  => 'HTML::Tree',
+        version => 0,
     }
 ];
 
-__PACKAGE__->NAME;
-\ No newline at end of file
+__PACKAGE__->NAME;
diff --git a/extensions/Profanivore/Extension.pm b/extensions/Profanivore/Extension.pm
index b77c09ce3..9889cc043 100644
--- a/extensions/Profanivore/Extension.pm
+++ b/extensions/Profanivore/Extension.pm
@@ -25,13 +25,15 @@ use base qw(Bugzilla::Extension);
 
 use Regexp::Common 'RE_ALL';
 
+use Bugzilla::Util qw(is_7bit_clean);
+
 our $VERSION = '0.01';
 
 sub bug_format_comment {
     my ($self, $args) = @_;
     my $regexes = $args->{'regexes'};
     my $comment = $args->{'comment'};
-  
+
     # Censor profanities if the comment author is not reasonably trusted.
     # However, allow people to see their own profanities, which might stop
     # them immediately noticing and trying to go around the filter. (I.e.
@@ -55,10 +57,10 @@ sub _replace_profanity {
 sub mailer_before_send {
     my ($self, $args) = @_;
     my $email = $args->{'email'};
-    
+
     my $author    = $email->header("X-Bugzilla-Who");
     my $recipient = $email->header("To");
-    
+
     if ($author && $recipient && lc($author) ne lc($recipient)) {
         my $email_suffix = Bugzilla->params->{'emailsuffix'};
         if ($email_suffix ne '') {
@@ -72,14 +74,82 @@ sub mailer_before_send {
             $author->id && 
             !$author->in_group('editbugs'))
         {
-            my $body = $email->body_str();
+            # Multipart emails
+            if (scalar $email->parts > 1) {
+                $email->walk_parts(sub {
+                    my ($part) = @_;
+                    return if $part->parts > 1; # Top-level
+                    # do not filter attachments such as patches, etc.
+                    if ($part->header('Content-Disposition')
+                        && $part->header('Content-Disposition') =~ /attachment/)
+                    {
+                        return;
+                    }
+                    _fix_encoding($part);
+                    my $body = $part->body_str;
+                    if ($part->content_type =~ /^text\/html/) {
+                        $body = _filter_html($body);
+                    }
+                    elsif ($part->content_type =~ /^text\/plain/) {
+                        $body = _filter_text($body);
+                    }
+                    $part->body_str_set($body);
+                });
+            }
+            # Single part email
+            else {
+                _fix_encoding($email);
+                $email->body_str_set(_filter_text($email->body_str));
+            }
+        }
+    }
+}
 
-            my $offensive = RE_profanity();
-            $body =~ s/$offensive/****/g;
+# Set charset UTF-8 when the 'utf8' param is on; use QP if the body is not 7-bit clean.
+sub _fix_encoding {
+    my ($part) = @_;
+    my $snapshot = $part->body;    # captured before body_set() may run below
+    if (Bugzilla->params->{'utf8'}) {
+        $part->charset_set('UTF-8');
+        my $octets = $part->body_raw;
+        if (utf8::is_utf8($octets)) {
+            utf8::encode($octets);    # encoding_set works only with bytes
+            $part->body_set($octets);
+        }
+    }
+    $part->encoding_set('quoted-printable') unless is_7bit_clean($snapshot);
+}
+
+sub _filter_text {    # Returns the text with each RE_profanity() match set to '****'.
+    my ($censored) = @_;
+    my $profanity_re = RE_profanity();
+    $censored =~ s/$profanity_re/****/g;
+    return $censored;
+}
+
+sub _filter_html {    # Censor <pre> text under div#comments; returns the HTML.
+    my ($html) = @_;
+    my $tree = HTML::Tree->new->parse_content($html);
+    my $comments_div = $tree->look_down( _tag => 'div', id => 'comments' );
+    if ($comments_div) {    # without that div the input is returned untouched
+        _filter_html_node($_) foreach $comments_div->look_down( _tag => 'pre' );
+        $html = $tree->as_HTML;
+    }
+    $tree->delete;    # release the parse tree on every path (was never freed)
+    return $html;
+}
 
-            $email->body_str_set($body);
+sub _filter_html_node {    # Censor all text beneath $node in place; returns $node.
+    my $node = shift;
+    # Element children are recursed into; plain-text children go through _filter_text.
+    foreach my $item_r ($node->content_refs_list) {
+        if (ref $$item_r) {
+            _filter_html_node($$item_r);
+        } else {
+            $$item_r = _filter_text($$item_r);
         }
     }
+    return $node;
 }
 
 __PACKAGE__->NAME;
author	Dave Lawrence <dlawrence@mozilla.com>	2012-08-28 17:00:18 +0200
committer	Dave Lawrence <dlawrence@mozilla.com>	2012-08-28 17:00:18 +0200
commit	cca07e40f68720087992ec8b5337bca91a942d91 (patch)
tree	c06120a410eb111e9856e111d0c0ad5dfec0dc0c /extensions
parent	2def70a01bd470f46fa8b83a71e1ec59b7089093 (diff)
download	bugzilla-cca07e40f68720087992ec8b5337bca91a942d91.tar.gz bugzilla-cca07e40f68720087992ec8b5337bca91a942d91.tar.xz