From e17bd11ddb9407d5b8cf2a53df5a9ecf514748d4 Mon Sep 17 00:00:00 2001
From: Byron Jones <bjones@mozilla.com>
Date: Wed, 9 Mar 2011 17:46:02 +0800
Subject: Bug 633776: Automatic charset detection for text attachments
 r=mkanat, a=mkanat

---
 contrib/recode.pl | 55 ++-----------------------------------------------------
 1 file changed, 2 insertions(+), 53 deletions(-)

(limited to 'contrib/recode.pl')

diff --git a/contrib/recode.pl b/contrib/recode.pl
index f7ba034ac..f8de12eb1 100755
--- a/contrib/recode.pl
+++ b/contrib/recode.pl
@@ -24,10 +24,10 @@ use lib qw(. lib);
 
 use Bugzilla;
 use Bugzilla::Constants;
+use Bugzilla::Util qw(detect_encoding);
 
 use Digest::MD5 qw(md5_base64);
 use Encode qw(encode decode resolve_alias is_utf8);
-use Encode::Guess;
 use Getopt::Long;
 use Pod::Usage;
 
@@ -71,53 +71,6 @@ sub trunc {
     return $truncated;
 }
 
-sub do_guess {
-    my ($data) = @_;
-
-    my $encoding = detect($data);
-    $encoding = resolve_alias($encoding) if $encoding;
-
-    # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
-    # is better at them. Here's the details:
-
-    # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
-    # tends to accidentally mis-detect UTF-8 strings as being
-    # these encodings.)
-    my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp);
-    if ($encoding && grep($_ eq $encoding, @utf8_accidental)) {
-        $encoding = undef;
-        my $decoder = guess_encoding($data, @utf8_accidental);
-        $encoding = $decoder->name if ref $decoder;
-    }
-
-    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
-    # but Encode::Guess can usually tell which one it is.
-    if ($encoding && $encoding eq 'iso-8859-8') {
-        my $decoded_as = guess_iso($data, 'iso-8859-8', 
-            # These are ordered this way because it gives the most 
-            # accurate results.
-            qw(iso-8859-7 iso-8859-2));
-        $encoding = $decoded_as if $decoded_as;
-    }
-
-    return $encoding;
-}
-
-# A helper for do_guess.
-sub guess_iso {
-    my ($data, $versus, @isos) = @_;
-
-    my $encoding;
-    foreach my $iso (@isos) {
-        my $decoder = guess_encoding($data, ($iso, $versus));
-        if (ref $decoder) {
-            $encoding = $decoder->name if ref $decoder;
-            last;
-        }
-    }
-    return $encoding;
-}
-
 sub is_valid_utf8 {
     my ($str) = @_;
     Encode::_utf8_on($str);
@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) {
 }
 
 if ($switch{'guess'}) {
-    # Encode::Detect::Detector doesn't seem to return a true value.
-    # So we have to check if we can run detect.
     if (!eval { require Encode::Detect::Detector }) {
         my $root = ROOT_USER;
         print STDERR <<EOT;
@@ -156,8 +107,6 @@ Encode::Detect, run the following command:
 EOT
         exit;
     }
-
-    import Encode::Detect::Detector qw(detect);
 }
 
 my %overrides;
@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) {
 
                 my $encoding;
                 if ($switch{'guess'}) {
-                    $encoding = do_guess($data);
+                    $encoding = detect_encoding($data);
 
                     # We only show failures if they don't appear to be
                     # ASCII.
-- 
cgit v1.2.3-24-g4f1b