From e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 Mon Sep 17 00:00:00 2001
From: Byron Jones
Date: Wed, 9 Mar 2011 17:46:02 +0800
Subject: Bug 633776: Automatic charset detection for text attachments r=mkanat, a=mkanat

---
 Bugzilla/Util.pm | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 Bugzilla/Util.pm

(limited to 'Bugzilla/Util.pm')

diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm
old mode 100644
new mode 100755
index 058a49af3..ced15491d
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -43,7 +43,8 @@ use base qw(Exporter);
                              file_mod_time is_7bit_clean
                              bz_crypt generate_random_password
                              validate_email_syntax clean_text
-                             get_text template_var disable_utf8);
+                             get_text template_var disable_utf8
+                             detect_encoding);
 
 use Bugzilla::Constants;
 
@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
 use Scalar::Util qw(tainted blessed);
 use Template::Filters;
 use Text::Wrap;
+use Encode qw(encode decode resolve_alias);
+use Encode::Guess;
 
 sub trick_taint {
     require Carp;
@@ -673,6 +676,72 @@ sub disable_utf8 {
     }
 }
 
+# Encodings that Encode::Detect tends to report by mistake for UTF-8 data.
+use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);
+
+# Guesses the character encoding of $data and returns its canonical Encode
+# name, or undef if no encoding could be determined. Throws the
+# 'feature_disabled' code error unless the 'detect_charset' feature is
+# available.
+sub detect_encoding {
+    my $data = shift;
+
+    if (!Bugzilla->feature('detect_charset')) {
+        require Bugzilla::Error;
+        Bugzilla::Error::ThrowCodeError('feature_disabled',
+            { feature => 'detect_charset' });
+    }
+
+    require Encode::Detect::Detector;
+    Encode::Detect::Detector->import('detect');
+
+    my $encoding = detect($data);
+    $encoding = resolve_alias($encoding) if $encoding;
+
+    # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
+    # is better at them. Here are the details:
+
+    # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
+    # tends to accidentally mis-detect UTF-8 strings as being
+    # these encodings.)
+    if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) {
+        $encoding = undef;
+        my $decoder = guess_encoding($data, UTF8_ACCIDENTAL);
+        $encoding = $decoder->name if ref $decoder;
+    }
+
+    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
+    # but Encode::Guess can usually tell which one it is.
+    if ($encoding && $encoding eq 'iso-8859-8') {
+        my $decoded_as = _guess_iso($data, 'iso-8859-8',
+            # These are ordered this way because it gives the most
+            # accurate results.
+            qw(iso-8859-7 iso-8859-2));
+        $encoding = $decoded_as if $decoded_as;
+    }
+
+    return $encoding;
+}
+
+# A helper for detect_encoding. Tries each encoding in @isos in order, asking
+# Encode::Guess to choose between it and $versus, and returns the name of the
+# first unambiguous guess (undef if every attempt fails).
+sub _guess_iso {
+    # Take the rest of @_ as the candidate list; a third "shift" would keep
+    # only the first candidate and silently ignore the others.
+    my ($data, $versus, @isos) = @_;
+
+    my $encoding;
+    foreach my $iso (@isos) {
+        my $decoder = guess_encoding($data, ($iso, $versus));
+        if (ref $decoder) {
+            $encoding = $decoder->name;
+            last;
+        }
+    }
+    return $encoding;
+}
+
 1;
 
 __END__
@@ -903,6 +972,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carrage Return).
 
 Disable utf8 on STDOUT (and display raw data instead).
 
+=item C<detect_encoding($data)>
+
+Guesses what encoding a given data is encoded in, returning the canonical name
+of the detected encoding (which may be different from the MIME charset
+specification).
+
 =item C<clean_text($dirty_text)>
 Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
 Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
-- 
cgit v1.2.3-24-g4f1b