summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorByron Jones <bjones@mozilla.com>2011-03-09 10:46:02 +0100
committerByron Jones <bjones@mozilla.com>2011-03-09 10:46:02 +0100
commite17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 (patch)
tree0c50e3fbaa6db66c20d6e54101af5f9a5aecb013
parentfd4f9fadbede46895d8c9e9853e04e02fe701662 (diff)
downloadbugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.gz
bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.xz
Bug 633776: Automatic charset detection for text attachments
r=mkanat, a=mkanat
-rwxr-xr-x[-rw-r--r--]Bugzilla/Config/Attachment.pm0
-rwxr-xr-x[-rw-r--r--]Bugzilla/Config/Common.pm0
-rwxr-xr-x[-rw-r--r--]Bugzilla/Install/Requirements.pm13
-rwxr-xr-x[-rw-r--r--]Bugzilla/Util.pm68
-rwxr-xr-xattachment.cgi8
-rwxr-xr-xcontrib/recode.pl55
-rwxr-xr-x[-rw-r--r--]template/en/default/admin/params/attachment.html.tmpl0
-rwxr-xr-x[-rw-r--r--]template/en/default/setup/strings.txt.pl1
8 files changed, 90 insertions, 55 deletions
diff --git a/Bugzilla/Config/Attachment.pm b/Bugzilla/Config/Attachment.pm
index e6e3b7f3d..e6e3b7f3d 100644..100755
--- a/Bugzilla/Config/Attachment.pm
+++ b/Bugzilla/Config/Attachment.pm
diff --git a/Bugzilla/Config/Common.pm b/Bugzilla/Config/Common.pm
index 9fffe02ee..9fffe02ee 100644..100755
--- a/Bugzilla/Config/Common.pm
+++ b/Bugzilla/Config/Common.pm
diff --git a/Bugzilla/Install/Requirements.pm b/Bugzilla/Install/Requirements.pm
index 047ed36f1..5b8f77e30 100644..100755
--- a/Bugzilla/Install/Requirements.pm
+++ b/Bugzilla/Install/Requirements.pm
@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES {
version => 0,
feature => ['html_desc'],
},
+ {
+ # we need version 2.21 of Encode for mime_name
+ package => 'Encode',
+ module => 'Encode',
+ version => 2.21,
+ feature => ['detect_charset'],
+ },
+ {
+ package => 'Encode-Detect',
+ module => 'Encode::Detect',
+ version => 0,
+ feature => ['detect_charset'],
+ },
# Inbound Email
{
diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm
index 058a49af3..ced15491d 100644..100755
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -43,7 +43,8 @@ use base qw(Exporter);
file_mod_time is_7bit_clean
bz_crypt generate_random_password
validate_email_syntax clean_text
- get_text template_var disable_utf8);
+ get_text template_var disable_utf8
+ detect_encoding);
use Bugzilla::Constants;
@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
use Scalar::Util qw(tainted blessed);
use Template::Filters;
use Text::Wrap;
+use Encode qw(encode decode resolve_alias);
+use Encode::Guess;
sub trick_taint {
require Carp;
@@ -673,6 +676,63 @@ sub disable_utf8 {
}
}
+use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);
+
+# Returns the canonical Encode name of the encoding detected for $data, or
+# undef when no encoding could be determined. Throws the feature_disabled
+# code error when the detect_charset feature is not available.
+sub detect_encoding {
+    my $data = shift;
+
+    if (!Bugzilla->feature('detect_charset')) {
+        require Bugzilla::Error;
+        Bugzilla::Error::ThrowCodeError('feature_disabled',
+            { feature => 'detect_charset' });
+    }
+
+    # Encode::Detect is an optional module, so it is only loaded on demand.
+    require Encode::Detect::Detector;
+    Encode::Detect::Detector->import('detect');
+
+    my $encoding = detect($data);
+    $encoding = resolve_alias($encoding) if $encoding;
+
+    # Encode::Detect tends to mis-detect UTF-8 strings as shiftjis, big5-eten,
+    # euc-kr or euc-jp; Encode::Guess is better at telling these apart.
+    if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) {
+        $encoding = undef;
+        my $decoder = guess_encoding($data, UTF8_ACCIDENTAL);
+        $encoding = $decoder->name if ref $decoder;
+    }
+
+    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
+    # but Encode::Guess can usually tell which one it is.
+    if ($encoding && $encoding eq 'iso-8859-8') {
+        my $decoded_as = _guess_iso($data, 'iso-8859-8',
+            # These are ordered this way because it gives the most
+            # accurate results.
+            qw(iso-8859-7 iso-8859-2));
+        $encoding = $decoded_as if $decoded_as;
+    }
+
+    return $encoding;
+}
+
+# A helper for detect_encoding. Pairs each encoding in @isos with $versus,
+# asks Encode::Guess to choose, and returns the name from the first guess
+# that yields a decoder object (undef if none does).
+sub _guess_iso {
+    # Slurp all remaining arguments so that every candidate passed by the
+    # caller ends up in @isos, not just the first one.
+    my ($data, $versus, @isos) = @_;
+
+    foreach my $iso (@isos) {
+        my $decoder = guess_encoding($data, ($iso, $versus));
+        return $decoder->name if ref $decoder;
+    }
+    return;
+}
+
1;
__END__
@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carrage Return).
Disable utf8 on STDOUT (and display raw data instead).
+=item C<detect_encoding($str)>
+
+Guesses the encoding of the given data, returning the canonical name
+of the detected encoding (which may be different from the MIME charset
+specification).
+
=item C<clean_text($str)>
Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
diff --git a/attachment.cgi b/attachment.cgi
index 9273b5f29..8ea802f44 100755
--- a/attachment.cgi
+++ b/attachment.cgi
@@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader;
use Bugzilla::Token;
use Bugzilla::Keyword;
-use Encode qw(encode);
+use Encode qw(encode find_encoding);
# For most scripts we don't make $cgi and $template global variables. But
# when preparing Bugzilla for mod_perl, this script used these
@@ -335,6 +335,12 @@ sub view {
# In order to prevent Apache from adding a charset, we have to send a
# charset that's a single space.
$cgi->charset(' ');
+ if (Bugzilla->feature('detect_charset') && $contenttype =~ /^text\//) {
+ my $encoding = detect_encoding($attachment->data);
+ if ($encoding) {
+ $cgi->charset(find_encoding($encoding)->mime_name);
+ }
+ }
}
print $cgi->header(-type=>"$contenttype; name=\"$filename\"",
-content_disposition=> "$disposition; filename=\"$filename\"",
diff --git a/contrib/recode.pl b/contrib/recode.pl
index f7ba034ac..f8de12eb1 100755
--- a/contrib/recode.pl
+++ b/contrib/recode.pl
@@ -24,10 +24,10 @@ use lib qw(. lib);
use Bugzilla;
use Bugzilla::Constants;
+use Bugzilla::Util qw(detect_encoding);
use Digest::MD5 qw(md5_base64);
use Encode qw(encode decode resolve_alias is_utf8);
-use Encode::Guess;
use Getopt::Long;
use Pod::Usage;
@@ -71,53 +71,6 @@ sub trunc {
return $truncated;
}
-sub do_guess {
- my ($data) = @_;
-
- my $encoding = detect($data);
- $encoding = resolve_alias($encoding) if $encoding;
-
- # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
- # is better at them. Here's the details:
-
- # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
- # tends to accidentally mis-detect UTF-8 strings as being
- # these encodings.)
- my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp);
- if ($encoding && grep($_ eq $encoding, @utf8_accidental)) {
- $encoding = undef;
- my $decoder = guess_encoding($data, @utf8_accidental);
- $encoding = $decoder->name if ref $decoder;
- }
-
- # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
- # but Encode::Guess can usually tell which one it is.
- if ($encoding && $encoding eq 'iso-8859-8') {
- my $decoded_as = guess_iso($data, 'iso-8859-8',
- # These are ordered this way because it gives the most
- # accurate results.
- qw(iso-8859-7 iso-8859-2));
- $encoding = $decoded_as if $decoded_as;
- }
-
- return $encoding;
-}
-
-# A helper for do_guess.
-sub guess_iso {
- my ($data, $versus, @isos) = @_;
-
- my $encoding;
- foreach my $iso (@isos) {
- my $decoder = guess_encoding($data, ($iso, $versus));
- if (ref $decoder) {
- $encoding = $decoder->name if ref $decoder;
- last;
- }
- }
- return $encoding;
-}
-
sub is_valid_utf8 {
my ($str) = @_;
Encode::_utf8_on($str);
@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) {
}
if ($switch{'guess'}) {
- # Encode::Detect::Detector doesn't seem to return a true value.
- # So we have to check if we can run detect.
if (!eval { require Encode::Detect::Detector }) {
my $root = ROOT_USER;
print STDERR <<EOT;
@@ -156,8 +107,6 @@ Encode::Detect, run the following command:
EOT
exit;
}
-
- import Encode::Detect::Detector qw(detect);
}
my %overrides;
@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) {
my $encoding;
if ($switch{'guess'}) {
- $encoding = do_guess($data);
+ $encoding = detect_encoding($data);
# We only show failures if they don't appear to be
# ASCII.
diff --git a/template/en/default/admin/params/attachment.html.tmpl b/template/en/default/admin/params/attachment.html.tmpl
index 69f62e9be..69f62e9be 100644..100755
--- a/template/en/default/admin/params/attachment.html.tmpl
+++ b/template/en/default/admin/params/attachment.html.tmpl
diff --git a/template/en/default/setup/strings.txt.pl b/template/en/default/setup/strings.txt.pl
index fe4f65e4e..2284c87ef 100644..100755
--- a/template/en/default/setup/strings.txt.pl
+++ b/template/en/default/setup/strings.txt.pl
@@ -108,6 +108,7 @@ END
feature_smtp_auth => 'SMTP Authentication',
feature_updates => 'Automatic Update Notifications',
feature_xmlrpc => 'XML-RPC Interface',
+ feature_detect_charset => 'Automatic charset detection for text attachments',
file_remove => 'Removing ##name##...',
file_rename => 'Renaming ##from## to ##to##...',