diff options
author | Byron Jones <bjones@mozilla.com> | 2011-03-09 10:46:02 +0100 |
---|---|---|
committer | Byron Jones <bjones@mozilla.com> | 2011-03-09 10:46:02 +0100 |
commit | e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 (patch) | |
tree | 0c50e3fbaa6db66c20d6e54101af5f9a5aecb013 | |
parent | fd4f9fadbede46895d8c9e9853e04e02fe701662 (diff) | |
download | bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.gz bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.xz |
Bug 633776: Automatic charset detection for text attachments
r=mkanat, a=mkanat
-rwxr-xr-x[-rw-r--r--] | Bugzilla/Config/Attachment.pm | 0 | ||||
-rwxr-xr-x[-rw-r--r--] | Bugzilla/Config/Common.pm | 0 | ||||
-rwxr-xr-x[-rw-r--r--] | Bugzilla/Install/Requirements.pm | 13 | ||||
-rwxr-xr-x[-rw-r--r--] | Bugzilla/Util.pm | 68 | ||||
-rwxr-xr-x | attachment.cgi | 8 | ||||
-rwxr-xr-x | contrib/recode.pl | 55 | ||||
-rwxr-xr-x[-rw-r--r--] | template/en/default/admin/params/attachment.html.tmpl | 0 | ||||
-rwxr-xr-x[-rw-r--r--] | template/en/default/setup/strings.txt.pl | 1 |
8 files changed, 90 insertions, 55 deletions
diff --git a/Bugzilla/Config/Attachment.pm b/Bugzilla/Config/Attachment.pm index e6e3b7f3d..e6e3b7f3d 100644..100755 --- a/Bugzilla/Config/Attachment.pm +++ b/Bugzilla/Config/Attachment.pm diff --git a/Bugzilla/Config/Common.pm b/Bugzilla/Config/Common.pm index 9fffe02ee..9fffe02ee 100644..100755 --- a/Bugzilla/Config/Common.pm +++ b/Bugzilla/Config/Common.pm diff --git a/Bugzilla/Install/Requirements.pm b/Bugzilla/Install/Requirements.pm index 047ed36f1..5b8f77e30 100644..100755 --- a/Bugzilla/Install/Requirements.pm +++ b/Bugzilla/Install/Requirements.pm @@ -291,6 +291,19 @@ sub OPTIONAL_MODULES { version => 0, feature => ['html_desc'], }, + { + # we need version 2.21 of Encode for mime_name + package => 'Encode', + module => 'Encode', + version => 2.21, + feature => ['detect_charset'], + }, + { + package => 'Encode-Detect', + module => 'Encode::Detect', + version => 0, + feature => ['detect_charset'], + }, # Inbound Email { diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm index 058a49af3..ced15491d 100644..100755 --- a/Bugzilla/Util.pm +++ b/Bugzilla/Util.pm @@ -43,7 +43,8 @@ use base qw(Exporter); file_mod_time is_7bit_clean bz_crypt generate_random_password validate_email_syntax clean_text - get_text template_var disable_utf8); + get_text template_var disable_utf8 + detect_encoding); use Bugzilla::Constants; @@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand); use Scalar::Util qw(tainted blessed); use Template::Filters; use Text::Wrap; +use Encode qw(encode decode resolve_alias); +use Encode::Guess; sub trick_taint { require Carp; @@ -673,6 +676,63 @@ sub disable_utf8 { } } +use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp); + +sub detect_encoding { + my $data = shift; + + if (!Bugzilla->feature('detect_charset')) { + require Bugzilla::Error; + Bugzilla::Error::ThrowCodeError('feature_disabled', + { feature => 'detect_charset' }); + } + + require Encode::Detect::Detector; + import Encode::Detect::Detector 'detect'; + + my $encoding = detect($data); + $encoding = resolve_alias($encoding) if $encoding; + + # Encode::Detect is bad at detecting certain charsets, but Encode::Guess + # is better at them. Here's the details: + + # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect + # tends to accidentally mis-detect UTF-8 strings as being + # these encodings.) + if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) { + $encoding = undef; + my $decoder = guess_encoding($data, UTF8_ACCIDENTAL); + $encoding = $decoder->name if ref $decoder; + } + + # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8, + # but Encode::Guess can usually tell which one it is. + if ($encoding && $encoding eq 'iso-8859-8') { + my $decoded_as = _guess_iso($data, 'iso-8859-8', + # These are ordered this way because it gives the most + # accurate results. + qw(iso-8859-7 iso-8859-2)); + $encoding = $decoded_as if $decoded_as; + } + + return $encoding; +} + +# A helper for detect_encoding. +sub _guess_iso { + my ($data, $versus, @isos) = (shift, shift, shift); + + my $encoding; + foreach my $iso (@isos) { + my $decoder = guess_encoding($data, ($iso, $versus)); + if (ref $decoder) { + $encoding = $decoder->name if ref $decoder; + last; + } + } + return $encoding; +} + 1; __END__ @@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carrage Return). Disable utf8 on STDOUT (and display raw data instead). +=item C<detect_encoding($str)> + +Guesses what encoding a given data is encoded in, returning the canonical name +of the detected encoding (which may be different from the MIME charset +specification). + =item C<clean_text($str)> Returns the parameter "cleaned" by exchanging non-printable characters with spaces. Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space). diff --git a/attachment.cgi b/attachment.cgi index 9273b5f29..8ea802f44 100755 --- a/attachment.cgi +++ b/attachment.cgi @@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader; use Bugzilla::Token; use Bugzilla::Keyword; -use Encode qw(encode); +use Encode qw(encode find_encoding); # For most scripts we don't make $cgi and $template global variables. But # when preparing Bugzilla for mod_perl, this script used these @@ -335,6 +335,12 @@ sub view { # In order to prevent Apache from adding a charset, we have to send a # charset that's a single space. $cgi->charset(' '); + if (Bugzilla->feature('detect_charset') && $contenttype =~ /^text\//) { + my $encoding = detect_encoding($attachment->data); + if ($encoding) { + $cgi->charset(find_encoding($encoding)->mime_name); + } + } } print $cgi->header(-type=>"$contenttype; name=\"$filename\"", -content_disposition=> "$disposition; filename=\"$filename\"", diff --git a/contrib/recode.pl b/contrib/recode.pl index f7ba034ac..f8de12eb1 100755 --- a/contrib/recode.pl +++ b/contrib/recode.pl @@ -24,10 +24,10 @@ use lib qw(. lib); use Bugzilla; use Bugzilla::Constants; +use Bugzilla::Util qw(detect_encoding); use Digest::MD5 qw(md5_base64); use Encode qw(encode decode resolve_alias is_utf8); -use Encode::Guess; use Getopt::Long; use Pod::Usage; @@ -71,53 +71,6 @@ sub trunc { return $truncated; } -sub do_guess { - my ($data) = @_; - - my $encoding = detect($data); - $encoding = resolve_alias($encoding) if $encoding; - - # Encode::Detect is bad at detecting certain charsets, but Encode::Guess - # is better at them. Here's the details: - - # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect - # tends to accidentally mis-detect UTF-8 strings as being - # these encodings.) - my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp); - if ($encoding && grep($_ eq $encoding, @utf8_accidental)) { - $encoding = undef; - my $decoder = guess_encoding($data, @utf8_accidental); - $encoding = $decoder->name if ref $decoder; - } - - # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8, - # but Encode::Guess can usually tell which one it is. - if ($encoding && $encoding eq 'iso-8859-8') { - my $decoded_as = guess_iso($data, 'iso-8859-8', - # These are ordered this way because it gives the most - # accurate results. - qw(iso-8859-7 iso-8859-2)); - $encoding = $decoded_as if $decoded_as; - } - - return $encoding; -} - -# A helper for do_guess. -sub guess_iso { - my ($data, $versus, @isos) = @_; - - my $encoding; - foreach my $iso (@isos) { - my $decoder = guess_encoding($data, ($iso, $versus)); - if (ref $decoder) { - $encoding = $decoder->name if ref $decoder; - last; - } - } - return $encoding; -} - sub is_valid_utf8 { my ($str) = @_; Encode::_utf8_on($str); @@ -143,8 +96,6 @@ if (exists $switch{'charset'}) { } if ($switch{'guess'}) { - # Encode::Detect::Detector doesn't seem to return a true value. - # So we have to check if we can run detect. if (!eval { require Encode::Detect::Detector }) { my $root = ROOT_USER; print STDERR <<EOT; @@ -156,8 +107,6 @@ Encode::Detect, run the following command: EOT exit; } - - import Encode::Detect::Detector qw(detect); } my %overrides; @@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) { my $encoding; if ($switch{'guess'}) { - $encoding = do_guess($data); + $encoding = detect_encoding($data); # We only show failures if they don't appear to be # ASCII. diff --git a/template/en/default/admin/params/attachment.html.tmpl b/template/en/default/admin/params/attachment.html.tmpl index 69f62e9be..69f62e9be 100644..100755 --- a/template/en/default/admin/params/attachment.html.tmpl +++ b/template/en/default/admin/params/attachment.html.tmpl diff --git a/template/en/default/setup/strings.txt.pl b/template/en/default/setup/strings.txt.pl index fe4f65e4e..2284c87ef 100644..100755 --- a/template/en/default/setup/strings.txt.pl +++ b/template/en/default/setup/strings.txt.pl @@ -108,6 +108,7 @@ END feature_smtp_auth => 'SMTP Authentication', feature_updates => 'Automatic Update Notifications', feature_xmlrpc => 'XML-RPC Interface', + feature_detect_charset => 'Automatic charset detection for text attachments', file_remove => 'Removing ##name##...', file_rename => 'Renaming ##from## to ##to##...', |