Bug 633776: Automatic charset detection for text attachments

r=mkanat, a=mkanat
author: Byron Jones <bjones@mozilla.com> 2011-03-09 10:46:02 +0100
committer: Byron Jones <bjones@mozilla.com> 2011-03-09 10:46:02 +0100
commit: e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 (patch)
tree: 0c50e3fbaa6db66c20d6e54101af5f9a5aecb013 /Bugzilla
parent: fd4f9fadbede46895d8c9e9853e04e02fe701662 (diff)
download: bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.gz
bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.xz
4 files changed, 80 insertions, 1 deletions
diff --git a/Bugzilla/Config/Attachment.pm b/Bugzilla/Config/Attachment.pm
index e6e3b7f3d..e6e3b7f3d 100644..100755
--- a/Bugzilla/Config/Attachment.pm
+++ b/Bugzilla/Config/Attachment.pm
diff --git a/Bugzilla/Config/Common.pm b/Bugzilla/Config/Common.pm
index 9fffe02ee..9fffe02ee 100644..100755
--- a/Bugzilla/Config/Common.pm
+++ b/Bugzilla/Config/Common.pm
diff --git a/Bugzilla/Install/Requirements.pm b/Bugzilla/Install/Requirements.pm
index 047ed36f1..5b8f77e30 100644..100755
--- a/Bugzilla/Install/Requirements.pm
+++ b/Bugzilla/Install/Requirements.pm
@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES {
         version => 0,
         feature => ['html_desc'],
     },
+    {
+        # we need version 2.21 of Encode for mime_name
+        package => 'Encode',
+        module  => 'Encode',
+        version => 2.21,
+        feature => ['detect_charset'],
+    },
+    {
+        package => 'Encode-Detect',
+        module  => 'Encode::Detect',
+        version => 0,
+        feature => ['detect_charset'],
+    },
 
     # Inbound Email
     {
diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm
index 058a49af3..ced15491d 100644..100755
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -43,7 +43,8 @@ use base qw(Exporter);
                              file_mod_time is_7bit_clean
                              bz_crypt generate_random_password
                              validate_email_syntax clean_text
-                             get_text template_var disable_utf8);
+                             get_text template_var disable_utf8
+                             detect_encoding);
 
 use Bugzilla::Constants;
 
@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
 use Scalar::Util qw(tainted blessed);
 use Template::Filters;
 use Text::Wrap;
+use Encode qw(encode decode resolve_alias);
+use Encode::Guess;
 
 sub trick_taint {
     require Carp;
@@ -673,6 +676,63 @@ sub disable_utf8 {
     }
 }
 
+# Encodings that Encode::Detect tends to report by mistake for UTF-8 data.
+use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);
+
+# Guesses the character encoding of $data. Returns the canonical Encode name
+# of the detected encoding, or a false value when nothing could be detected.
+# Throws a 'feature_disabled' code error if 'detect_charset' is unavailable.
+sub detect_encoding {
+    my $data = shift;
+
+    if (!Bugzilla->feature('detect_charset')) {
+        require Bugzilla::Error;
+        Bugzilla::Error::ThrowCodeError('feature_disabled',
+            { feature => 'detect_charset' });
+    }
+
+    # Loaded lazily because Encode::Detect is an optional module.
+    require Encode::Detect::Detector;
+    Encode::Detect::Detector->import('detect');
+
+    my $encoding = detect($data);
+    $encoding = resolve_alias($encoding) if $encoding;
+
+    # Encode::Detect tends to mis-detect UTF-8 strings as shiftjis, big5-eten,
+    # euc-kr, or euc-jp; Encode::Guess is better at telling these apart.
+    if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) {
+        my $decoder = guess_encoding($data, UTF8_ACCIDENTAL);
+        $encoding = ref $decoder ? $decoder->name : undef;
+    }
+
+    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
+    # but Encode::Guess can usually tell which one it is.
+    if ($encoding && $encoding eq 'iso-8859-8') {
+        my $decoded_as = _guess_iso($data, 'iso-8859-8',
+            # These are ordered this way because it gives the most
+            # accurate results.
+            qw(iso-8859-7 iso-8859-2));
+        $encoding = $decoded_as if $decoded_as;
+    }
+
+    return $encoding;
+}
+
+# Helper for detect_encoding: returns the name of the first candidate pairing
+# (candidate vs. $versus) that Encode::Guess resolves, or nothing if none do.
+sub _guess_iso {
+    # Unpack from @_ so that *every* trailing argument becomes a candidate;
+    # a third shift would keep only the first and silently drop the rest.
+    my ($data, $versus, @isos) = @_;
+
+    foreach my $iso (@isos) {
+        my $decoder = guess_encoding($data, ($iso, $versus));
+        # guess_encoding yields an object on success, a plain string otherwise.
+        return $decoder->name if ref $decoder;
+    }
+    return;
+}
+
 1;
 
 __END__
@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carrage Return).
 
 Disable utf8 on STDOUT (and display raw data instead).
 
+=item C<detect_encoding($str)>
+
+Guesses the character encoding of the given data and returns the canonical
+name of the detected encoding (which may be different from the MIME charset
+specification).
+
 =item C<clean_text($str)>
 Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
 Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
author	Byron Jones <bjones@mozilla.com>	2011-03-09 10:46:02 +0100
committer	Byron Jones <bjones@mozilla.com>	2011-03-09 10:46:02 +0100
commit	e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 (patch)
tree	0c50e3fbaa6db66c20d6e54101af5f9a5aecb013 /Bugzilla
parent	fd4f9fadbede46895d8c9e9853e04e02fe701662 (diff)
download	bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.gz bugzilla-e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4.tar.xz