From b8012210e03800d6473d2b19d7cf1beae6d12dc3 Mon Sep 17 00:00:00 2001 From: "mkanat%bugzilla.org" <> Date: Sat, 18 Nov 2006 23:00:28 +0000 Subject: Bug 280633: Tools to migrate existing legacy encoded database to UTF-8 (Unicode) Patch By Max Kanat-Alexander r=justdave, a=justdave --- contrib/recode.pl | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100755 contrib/recode.pl (limited to 'contrib/recode.pl') diff --git a/contrib/recode.pl b/contrib/recode.pl new file mode 100755 index 000000000..7db01dddf --- /dev/null +++ b/contrib/recode.pl @@ -0,0 +1,346 @@ +#!/usr/bin/perl -w +# -*- Mode: perl; indent-tabs-mode: nil -*- +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# The Original Code is the Bugzilla Bug Tracking System. +# +# The Initial Developer of the Original Code is Everything Solved. +# Portions created by Everything Solved are Copyright (C) 2006 +# Everything Solved. All Rights Reserved. +# +# Contributor(s): Max Kanat-Alexander + +use strict; +# Allow the script to be run from contrib or as contrib/recode.pl +use lib '..'; + +use Bugzilla; +use Bugzilla::Constants; + +use Digest::MD5 qw(md5_base64); +use Encode qw(encode decode resolve_alias is_utf8); +use Encode::Guess; +use Getopt::Long; +use Pod::Usage; + +############# +# Constants # +############# + +use constant IGNORE_ENCODINGS => qw(utf8 ascii iso-8859-1); + +use constant MAX_STRING_LEN => 25; + +# For certain tables, we can't automatically determine their Primary Key. +# So, we specify it here as a string. +use constant SPECIAL_KEYS => { + bugs_activity => 'bug_id,bug_when,fieldid', + profile_setting => 'user_id,setting_name', + profiles_activity => 'userid,profiles_when,fieldid', + setting_value => 'name,value', +}; + +############### +# Subroutines # +############### + +# "truncate" is a file operation in perl, so we can't use that name. +sub trunc { + my ($str) = @_; + my $truncated = substr($str, 0, MAX_STRING_LEN); + if (length($truncated) ne length($str)) { + $truncated .= '...'; + } + return $truncated; +} + +sub do_guess { + my ($data) = @_; + + my $encoding = detect($data); + $encoding = resolve_alias($encoding) if $encoding; + + # Encode::Detect is bad at detecting certain charsets, but Encode::Guess + # is better at them. Here's the details: + + # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect + # tends to accidentally mis-detect UTF-8 strings as being + # these encodings.) + my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp); + if ($encoding && grep($_ eq $encoding, @utf8_accidental)) { + $encoding = undef; + my $decoder = guess_encoding($data, @utf8_accidental); + $encoding = $decoder->name if ref $decoder; + } + + # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8, + # but Encode::Guess can usually tell which one it is. + if ($encoding && $encoding eq 'iso-8859-8') { + my $decoded_as = guess_iso($data, 'iso-8859-8', + # These are ordered this way because it gives the most + # accurate results. + qw(iso-8859-7 iso-8859-2)); + $encoding = $decoded_as if $decoded_as; + } + + return $encoding; +} + +# A helper for do_guess. +sub guess_iso { + my ($data, $versus, @isos) = @_; + + my $encoding; + foreach my $iso (@isos) { + my $decoder = guess_encoding($data, ($iso, $versus)); + if (ref $decoder) { + $encoding = $decoder->name if ref $decoder; + last; + } + } + return $encoding; +} + +sub is_valid_utf8 { + my ($str) = @_; + Encode::_utf8_on($str); + return is_utf8($str, 1); +} + +############### +# Main Script # +############### + +my %switch; +GetOptions(\%switch, 'dry-run', 'guess', 'charset=s', 'show-failures', + 'overrides=s', 'help|h'); + +pod2usage({ -verbose => 1 }) if $switch{'help'}; + +# You have to specify at least one of these switches. +pod2usage({ -verbose => 0 }) if (!$switch{'charset'} && !$switch{'guess'}); + +if (exists $switch{'charset'}) { + $switch{'charset'} = resolve_alias($switch{'charset'}) + || die "'$switch{charset}' is not a valid charset."; +} + +if ($switch{'guess'}) { + # Encode::Detect::Detector doesn't seem to return a true value. + # So we have to check if we can run detect. + if (!eval { require Encode::Detect::Detector }) { + my $root = ROOT_USER; + print STDERR <getlines(); + $file->close(); + foreach my $line (@lines) { + chomp($line); + my ($digest, $encoding) = split(' ', $line); + $overrides{$digest} = $encoding; + } +} + +my $dbh = Bugzilla->dbh; + +# Make the database give us raw bytes. +$dbh->do('SET character_set_results = NULL') + if $dbh->isa('Bugzilla::DB::Mysql'); + +$dbh->begin_work; + +foreach my $table ($dbh->bz_table_list_real) { + my @columns = $dbh->bz_table_columns($table); + + my $pk = SPECIAL_KEYS->{$table}; + + # Figure out the primary key. + foreach my $column (@columns) { + my $def = $dbh->bz_column_info($table, $column); + $pk = $column if $def->{PRIMARYKEY}; + } + # If there's no PK, it's defined by the a UNIQUE index. + if (!$pk) { + foreach my $column (@columns) { + my $index = $dbh->bz_index_info($table, "${table}_${column}_idx"); + if ($index && ref($index) eq 'HASH') { + $pk = join(',', @{$index->{FIELDS}}) + if $index->{TYPE} eq 'UNIQUE'; + } + } + } + + foreach my $column (@columns) { + my $def = $dbh->bz_column_info($table, $column); + # If this is a text column, it may need work. + if ($def->{TYPE} =~ /text|char/i) { + print "Converting $table.$column...\n"; + my $sth = $dbh->prepare("SELECT $column, $pk FROM $table + WHERE $column IS NOT NULL + AND $column != ''"); + + my @pk_array = map {"$_ = ?"} split(',', $pk); + my $pk_where = join(' AND ', @pk_array); + my $update_sth = $dbh->prepare( + "UPDATE $table SET $column = ? WHERE $pk_where"); + + $sth->execute(); + + while (my @result = $sth->fetchrow_array) { + my $data = shift @result; + my $digest = md5_base64($data); + + my $encoding; + if ($switch{'guess'}) { + $encoding = do_guess($data); + + # We only show failures if they don't appear to be + # ASCII. + if ($switch{'show-failures'} && !$encoding + && !is_valid_utf8($data)) + { + my $truncated = trunc($data); + print "Failed to guess: Key: $digest", + " DATA: $truncated\n"; + } + + # If we fail a guess, and the data is valid UTF-8, + # just assume we failed because it's UTF-8. + next if is_valid_utf8($data); + } + + # If we couldn't detect the charset (or were instructed + # not to try), we fall back to --charset. If there's no + # fallback, we just do nothing. + if (!$encoding && $switch{'charset'}) { + $encoding = $switch{'charset'}; + } + + $encoding = $overrides{$digest} if $overrides{$digest}; + + # We only fix it if it's not ASCII or UTF-8 already. + if ($encoding && !grep($_ eq $encoding, IGNORE_ENCODINGS)) { + my $decoded = encode('utf8', decode($encoding, $data)); + if ($switch{'dry-run'} && $data ne $decoded) { + print "From: [" . trunc($data) . "] Key: $digest\n", + "To: [" . trunc($decoded) . "]", + " Encoding : $encoding\n"; + } + else { + $update_sth->execute($decoded, @result); + } + } + } # while (my @result = $sth->fetchrow_array) + } # if ($column->{TYPE} =~ /text|char/i) + } # foreach my $column (@columns) +} + +$dbh->commit; + +__END__ + +=head1 NAME + +recode.pl - Converts a database from one encoding (or multiple encodings) +to UTF-8. + +=head1 SYNOPSIS + + contrib/recode.pl [--guess [--show-failures]] [--charset=iso-8859-2] + [--overrides=file_name] + + --dry-run Don't modify the database. + + --charset Primary charset your data is currently in. This can be + optionally omitted if you do --guess. + + --guess Try to guess the charset of the data. + + --show-failures If we fail to guess, show where we failed. + + --overrides Specify a file containing overrides. See --help + for more info. + + --help Display detailed help. + + If you aren't sure what to do, try: + + contrib/recode.pl --guess --charset=cp1252 + +=head1 OPTIONS + +=over + +=item --dry-run + +Don't modify the database, just print out what the conversions will be. + +recode.pl will print out a Key for each item. You can use this in the +overrides file, described below. + +=item --guess + +If your database is in multiple different encodings, specify this switch +and recode.pl will do its best to determine the original charset of the data. +The detection is usually very reliable. + +If recode.pl cannot guess the charset, it will leave the data alone, unless +you've specified --charset. + +=item --charset=charset-name + +If you do not specify --guess, then your database is converted +from this character set into the UTF-8. + +If you have specified --guess, recode.pl will use this charset as +a fallback--when it cannot guess the charset of a particular piece +of data, it will guess that the data is in this charset and convert +it from this charset to UTF-8. + +charset-name must be a charset that is known to perl's Encode +module. To see a list of available charsets, do: + +Cencodings(":all"))'> + +=item --show-failures + +If --guess fails to guess a charset, print out the data it failed on. + +=item --overrides=file_name + +This is a way of specifying certain encodings to override the encodings of +--guess. The file is a series of lines. The line should start with the Key +from --dry-run, and then a space, and then the encoding you'd like to use. + +=back -- cgit v1.2.3-24-g4f1b