From 9f0310bf8c0821347699b434f659eb52decabf87 Mon Sep 17 00:00:00 2001 From: "mkanat%bugzilla.org" <> Date: Fri, 23 Nov 2007 12:58:33 +0000 Subject: Bug 363153: Turn on the utf8 bit on all strings in Bugzilla that contain non-ASCII data, if the utf8 parameter is on. This means that string functions like substr() work properly on multi-byte languages, now. Patch By Max Kanat-Alexander r=wurblzap, a=mkanat --- Bugzilla.pm | 1 + Bugzilla/CGI.pm | 21 +++++++++++++++++++++ Bugzilla/Constants.pm | 8 ++++---- Bugzilla/DB/Mysql.pm | 4 +++- Bugzilla/DB/Pg.pm | 4 +++- Bugzilla/Mailer.pm | 4 +++- Bugzilla/Util.pm | 6 ++++++ email_in.pl | 6 +++--- 8 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Bugzilla.pm b/Bugzilla.pm index 93c37a51c..075031482 100644 --- a/Bugzilla.pm +++ b/Bugzilla.pm @@ -81,6 +81,7 @@ use constant SHUTDOWNHTML_EXIT_SILENTLY => [ # Note that this is a raw subroutine, not a method, so $class isn't available. sub init_page { + (binmode STDOUT, ':utf8') if Bugzilla->params->{'utf8'}; # Some environment variables are not taint safe delete @::ENV{'PATH', 'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; diff --git a/Bugzilla/CGI.pm b/Bugzilla/CGI.pm index ef2cb70f5..3498b3c70 100644 --- a/Bugzilla/CGI.pm +++ b/Bugzilla/CGI.pm @@ -233,6 +233,27 @@ sub header { return $self->SUPER::header(@_) || ""; } +# CGI.pm is not utf8-aware and passes data as bytes instead of UTF-8 strings. +sub param { + my $self = shift; + if (Bugzilla->params->{'utf8'} && scalar(@_) == 1) { + if (wantarray) { + return map { _fix_utf8($_) } $self->SUPER::param(@_); + } + else { + return _fix_utf8(scalar $self->SUPER::param(@_)); + } + } + return $self->SUPER::param(@_); +} + +sub _fix_utf8 { + my $input = shift; + # The is_utf8 is here in case CGI gets smart about utf8 someday. + utf8::decode($input) if defined $input && !utf8::is_utf8($input); + return $input; +} + # The various parts of Bugzilla which create cookies don't want to have to # pass them around to all of the callers. 
Instead, store them locally here, # and then output as required from |header|. diff --git a/Bugzilla/Constants.pm b/Bugzilla/Constants.pm index 12d54abee..e34fc0bb7 100644 --- a/Bugzilla/Constants.pm +++ b/Bugzilla/Constants.pm @@ -382,10 +382,10 @@ use constant DB_MODULE => { dbd => { package => 'DBD-mysql', module => 'DBD::mysql', - version => '2.9003', - # Certain versions are broken, development versions are - # always disallowed. - blacklist => ['^3\.000[3-6]', '_'], + # Disallow development versions + blacklist => ['_'], + # For UTF-8 support + version => '4.00', }, name => 'MySQL'}, 'pg' => {db => 'Bugzilla::DB::Pg', db_version => '8.00.0000', diff --git a/Bugzilla/DB/Mysql.pm b/Bugzilla/DB/Mysql.pm index 25ee32b64..9e0d25277 100644 --- a/Bugzilla/DB/Mysql.pm +++ b/Bugzilla/DB/Mysql.pm @@ -58,8 +58,10 @@ sub new { my $dsn = "DBI:mysql:host=$host;database=$dbname"; $dsn .= ";port=$port" if $port; $dsn .= ";mysql_socket=$sock" if $sock; + + my $attrs = { mysql_enable_utf8 => Bugzilla->params->{'utf8'} }; - my $self = $class->db_new($dsn, $user, $pass); + my $self = $class->db_new($dsn, $user, $pass, $attrs); # This makes sure that if the tables are encoded as UTF-8, we # return their data correctly. diff --git a/Bugzilla/DB/Pg.pm b/Bugzilla/DB/Pg.pm index 9f5b67757..bd16b654c 100644 --- a/Bugzilla/DB/Pg.pm +++ b/Bugzilla/DB/Pg.pm @@ -68,7 +68,9 @@ sub new { # creating tables. $dsn .= ";options='-c client_min_messages=warning'"; - my $self = $class->db_new($dsn, $user, $pass); + my $attrs = { pg_enable_utf8 => Bugzilla->params->{'utf8'} }; + + my $self = $class->db_new($dsn, $user, $pass, $attrs); # all class local variables stored in DBI derived class needs to have # a prefix 'private_'. See DBI documentation. 
diff --git a/Bugzilla/Mailer.pm b/Bugzilla/Mailer.pm index 03f370a4e..48f40d8b7 100644 --- a/Bugzilla/Mailer.pm +++ b/Bugzilla/Mailer.pm @@ -67,7 +67,9 @@ sub MessageToMTA { # Encode the headers correctly in quoted-printable foreach my $header qw(From To Cc Reply-To Sender Errors-To Subject) { if (my $value = $email->header($header)) { - $value = Encode::decode("UTF-8", $value) if Bugzilla->params->{'utf8'}; + if (Bugzilla->params->{'utf8'} && !utf8::is_utf8($value)) { + utf8::decode($value); + } my $encoded = encode('MIME-Q', $value); $email->header_set($header, $encoded); } diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm index ffc2af95a..144fb87c1 100644 --- a/Bugzilla/Util.pm +++ b/Bugzilla/Util.pm @@ -185,6 +185,8 @@ sub html_light_quote { # This originally came from CGI.pm, by Lincoln D. Stein sub url_quote { my ($toencode) = (@_); + utf8::encode($toencode) # The below regex works only on bytes + if Bugzilla->params->{'utf8'} && utf8::is_utf8($toencode); $toencode =~ s/([^a-zA-Z0-9_\-.])/uc sprintf("%%%02x",ord($1))/eg; return $toencode; } @@ -206,6 +208,10 @@ sub xml_quote { return $var; } +# This function must not be relied upon to return a valid string to pass to +# the DB or the user in UTF-8 situations. The only thing you can rely upon +# it for is that if you url_decode a string, it will url_encode back to the +# exact same thing. 
sub url_decode { my ($todecode) = (@_); $todecode =~ tr/+/ /; # pluses become spaces diff --git a/email_in.pl b/email_in.pl index 7b8232192..97fea4847 100644 --- a/email_in.pl +++ b/email_in.pl @@ -38,7 +38,7 @@ use Email::MIME; use Email::MIME::Attachment::Stripper; use Getopt::Long qw(:config bundling); use Pod::Usage; -use Encode qw(encode decode); +use Encode; use Bugzilla; use Bugzilla::Bug qw(ValidateBugID); @@ -306,8 +306,8 @@ sub get_text_alternative { debug_print("Part Character Encoding: $charset", 2); if (!$ct || $ct =~ /^text\/plain/i) { $body = $part->body; - if (Bugzilla->params->{'utf8'}) { - $body = encode('UTF-8', decode($charset, $body)); + if (Bugzilla->params->{'utf8'} && !utf8::is_utf8($body)) { + $body = Encode::decode($charset, $body); } last; } -- cgit v1.2.3-24-g4f1b