From cca07e40f68720087992ec8b5337bca91a942d91 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 28 Aug 2012 11:00:18 -0400 Subject: Bug 785309 - Profanivore is throwing "ascii "\xB4" does not map to Unicode" errors r=glob --- extensions/Profanivore/Config.pm | 7 +++- extensions/Profanivore/Extension.pm | 84 +++++++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 8 deletions(-) (limited to 'extensions/Profanivore') diff --git a/extensions/Profanivore/Config.pm b/extensions/Profanivore/Config.pm index 778301fbb..354325c58 100644 --- a/extensions/Profanivore/Config.pm +++ b/extensions/Profanivore/Config.pm @@ -29,7 +29,12 @@ use constant REQUIRED_MODULES => [ package => 'Regexp-Common', module => 'Regexp::Common', version => 0 + }, + { + package => 'HTML-Tree', + module => 'HTML::Tree', + version => 0, } ]; -__PACKAGE__->NAME; \ No newline at end of file +__PACKAGE__->NAME; diff --git a/extensions/Profanivore/Extension.pm b/extensions/Profanivore/Extension.pm index b77c09ce3..9889cc043 100644 --- a/extensions/Profanivore/Extension.pm +++ b/extensions/Profanivore/Extension.pm @@ -25,13 +25,15 @@ use base qw(Bugzilla::Extension); use Regexp::Common 'RE_ALL'; +use Bugzilla::Util qw(is_7bit_clean); + our $VERSION = '0.01'; sub bug_format_comment { my ($self, $args) = @_; my $regexes = $args->{'regexes'}; my $comment = $args->{'comment'}; - + # Censor profanities if the comment author is not reasonably trusted. # However, allow people to see their own profanities, which might stop # them immediately noticing and trying to go around the filter. (I.e. @@ -55,10 +57,10 @@ sub _replace_profanity { sub mailer_before_send { my ($self, $args) = @_; my $email = $args->{'email'}; - + my $author = $email->header("X-Bugzilla-Who"); my $recipient = $email->header("To"); - + if ($author && $recipient && lc($author) ne lc($recipient)) { my $email_suffix = Bugzilla->params->{'emailsuffix'}; if ($email_suffix ne '') { @@ -72,14 +74,82 @@ sub mailer_before_send { $author->id && !$author->in_group('editbugs')) { - my $body = $email->body_str(); + # Multipart emails + if (scalar $email->parts > 1) { + $email->walk_parts(sub { + my ($part) = @_; + return if $part->parts > 1; # Top-level + # do not filter attachments such as patches, etc. + if ($part->header('Content-Disposition') + && $part->header('Content-Disposition') =~ /attachment/) + { + return; + } + _fix_encoding($part); + my $body = $part->body_str; + if ($part->content_type =~ /^text\/html/) { + $body = _filter_html($body); + } + elsif ($part->content_type =~ /^text\/plain/) { + $body = _filter_text($body); + } + $part->body_str_set($body); + }); + } + # Single part email + else { + _fix_encoding($email); + $email->body_str_set(_filter_text($email->body_str)); + } + } + } +} - my $offensive = RE_profanity(); - $body =~ s/$offensive/****/g; +sub _fix_encoding { + my $part = shift; + my $body = $part->body; + if (Bugzilla->params->{'utf8'}) { + $part->charset_set('UTF-8'); + # encoding_set works only with bytes, not with utf8 strings. + my $raw = $part->body_raw; + if (utf8::is_utf8($raw)) { + utf8::encode($raw); + $part->body_set($raw); + } + } + $part->encoding_set('quoted-printable') if !is_7bit_clean($body); +} + +sub _filter_text { + my $text = shift; + my $offensive = RE_profanity(); + $text =~ s/$offensive/****/g; + return $text; +} + +sub _filter_html { + my $html = shift; + my $tree = HTML::Tree->new->parse_content($html); + my $comments_div = $tree->look_down( _tag => 'div', id => 'comments' ); + return $html if !$comments_div; + my @comments = $comments_div->look_down( _tag => 'pre' ); + foreach my $comment (@comments) { + _filter_html_node($comment); + } + return $tree->as_HTML; +} - $email->body_str_set($body); +sub _filter_html_node { + my $node = shift; + my $content = [ $node->content_list ]; + foreach my $item_r ($node->content_refs_list) { + if (ref $$item_r) { + _filter_html_node($$item_r); + } else { + $$item_r = _filter_text($$item_r); } } + return $node; } __PACKAGE__->NAME; -- cgit v1.2.3-24-g4f1b