From 4adf7f1b401955a1938cfc7a9decdc77af2fab20 Mon Sep 17 00:00:00 2001 From: "mkanat%bugzilla.org" <> Date: Sat, 24 Oct 2009 05:30:14 +0000 Subject: Bug 519584: Implement a framework for migrating from other bug-trackers, and start with a GNATS importer. Patch by Max Kanat-Alexander (module owner) a=mkanat --- Bugzilla/Migrate/Gnats.pm | 709 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 709 insertions(+) create mode 100644 Bugzilla/Migrate/Gnats.pm (limited to 'Bugzilla/Migrate/Gnats.pm') diff --git a/Bugzilla/Migrate/Gnats.pm b/Bugzilla/Migrate/Gnats.pm new file mode 100644 index 000000000..232100f2d --- /dev/null +++ b/Bugzilla/Migrate/Gnats.pm @@ -0,0 +1,709 @@ +# -*- Mode: perl; indent-tabs-mode: nil -*- +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# The Original Code is The Bugzilla Migration Tool. +# +# The Initial Developer of the Original Code is Lambda Research +# Corporation. Portions created by the Initial Developer are Copyright +# (C) 2009 the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Max Kanat-Alexander + +package Bugzilla::Migrate::Gnats; +use strict; +use base qw(Bugzilla::Migrate); + +use Bugzilla::Constants; +use Bugzilla::Install::Util qw(indicate_progress); +use Bugzilla::Util qw(format_time trim generate_random_password lsearch); + +use Email::Address; +use Email::MIME; +use File::Basename; +use IO::File; +use List::Util qw(first); + +use constant REQUIRED_MODULES => [ + { + package => 'Email-Simple-FromHandle', + module => 'Email::Simple::FromHandle', + # This version added seekable handles. + version => 0.050, + }, +]; + +use constant FIELD_MAP => { + 'Number' => 'bug_id', + 'Category' => 'product', + 'Synopsis' => 'short_desc', + 'Responsible' => 'assigned_to', + 'State' => 'bug_status', + 'Class' => 'cf_type', + 'Classification' => '', + 'Originator' => 'reporter', + 'Arrival-Date' => 'creation_ts', + 'Last-Modified' => 'delta_ts', + 'Release' => 'version', + 'Severity' => 'bug_severity', + 'Description' => 'comment', +}; + +use constant VALUE_MAP => { + bug_severity => { + 'serious' => 'major', + 'cosmetic' => 'trivial', + 'new-feature' => 'enhancement', + 'non-critical' => 'normal', + }, + bug_status => { + 'open' => 'NEW', + 'analyzed' => 'ASSIGNED', + 'suspended' => 'RESOLVED', + 'feedback' => 'RESOLVED', + 'released' => 'VERIFIED', + }, + bug_status_resolution => { + 'feedback' => 'FIXED', + 'released' => 'FIXED', + 'closed' => 'FIXED', + 'suspended' => 'LATER', + }, + priority => { + 'medium' => 'Normal', + }, +}; + +use constant GNATS_CONFIG_VARS => ( + { + name => 'gnats_path', + default => '/var/lib/gnats', + desc => < 'default_email_domain', + default => 'example.com', + desc => <<'END', +# Some GNATS users do not have full email addresses, but Bugzilla requires +# every user to have an email address. What domain should be appended to +# usernames that don't have emails, to make them into email addresses? +# (For example, if you leave this at the default, "unknown" would become +# "unknown@example.com".) +END + }, + { + name => 'component_name', + default => 'General', + desc => <<'END', +# GNATS has only "Category" to classify bugs. However, Bugzilla has a +# multi-level system of Products that contain Components. When importing +# GNATS categories, they become a Product with one Component. What should +# the name of that Component be? +END + }, + { + name => 'version_regex', + default => '', + desc => <<'END', +# In GNATS, the "version" field can contain almost anything. However, in +# Bugzilla, it's a drop-down, so you don't want too many choices in there. +# If you specify a regular expression here, versions will be tested against +# this regular expression, and if they match, the first match (the first set +# of parentheses in the regular expression, also called "$1") will be used +# as the version value for the bug instead of the full version value specified +# in GNATS. +END + }, + { + name => 'default_originator', + default => 'gnats-admin', + desc => <<'END', +# Sometimes, a PR has no valid Originator, so we fall back to the From +# header of the email. If the From header also isn't a valid username +# (is just a name with spaces in it--we can't convert that to an email +# address) then this username (which can either be a GNATS username or an +# email address) will be considered to be the Originator of the PR. +END + } +); + +sub CONFIG_VARS { + my $self = shift; + my @vars = (GNATS_CONFIG_VARS, $self->SUPER::CONFIG_VARS); + my $field_map = first { $_->{name} eq 'translate_fields' } @vars; + $field_map->{default} = FIELD_MAP; + my $value_map = first { $_->{name} eq 'translate_values' } @vars; + $value_map->{default} = VALUE_MAP; + return @vars; +} + +# Directories that aren't projects, or that we shouldn't be parsing +use constant SKIP_DIRECTORIES => qw( + gnats-adm + gnats-queue + pending +); + +use constant NON_COMMENT_FIELDS => qw( + Audit-Trail + Closed-Date + Confidential + Unformatted + attachments +); + +# Certain fields can contain things that look like fields in them, +# because they might contain quoted emails. To avoid mis-parsing, +# we list out here the exact order of fields at the end of a PR +# and wait for the next field to consider that we actually have +# a field to parse. +use constant END_FIELD_ORDER => [qw( + Description + How-To-Repeat + Fix + Release-Note + Audit-Trail + Unformatted +)]; + +use constant CUSTOM_FIELDS => { + cf_type => { + type => FIELD_TYPE_SINGLE_SELECT, + description => 'Type', + }, +}; + +use constant FIELD_REGEX => qr/^>(\S+):\s*(.*)$/; + +# Used for bugs that have no Synopsis. +use constant NO_SUBJECT => "(no subject)"; + +# This is the divider that GNATS uses between attachments in its database +# files. It's missign two hyphens at the beginning because MIME Emails use +# -- to start boundaries. +use constant GNATS_BOUNDARY => '----gnatsweb-attachment----'; + +use constant LONG_VERSION_LENGTH => 32; + +######### +# Hooks # +######### + +sub before_insert { + my $self = shift; + + # gnats_id isn't a valid User::create field, and we don't need it + # anymore now. + delete $_->{gnats_id} foreach @{ $self->users }; + + # Grab a version out of a bug for each product, so that there is a + # valid "version" argument for Bugzilla::Product->create. + foreach my $product (@{ $self->products }) { + my $bug = first { $_->{product} eq $product->{name} and $_->{version} } + @{ $self->bugs }; + if (defined $bug) { + $product->{version} = $bug->{version}; + } + else { + $product->{version} = 'unspecified'; + } + } +} + +######### +# Users # +######### + +sub _read_users { + my $self = shift; + my $path = $self->config('gnats_path'); + my $file = "$path/gnats-adm/responsible"; + $self->debug("Reading users from $file"); + my $default_domain = $self->config('default_email_domain'); + open(my $users_fh, '<', $file) || die "$file: $!"; + my @users; + foreach my $line (<$users_fh>) { + $line = trim($line); + next if $line =~ /^#/; + my ($id, $name, $email) = split(':', $line, 3); + $email ||= "$id\@$default_domain"; + # We can't call our own translate_value, because that depends on + # the existence of user_map, which doesn't exist until after + # this method. However, we still want to translate any users found. + $email = $self->SUPER::translate_value('user', $email); + push(@users, { realname => $name, login_name => $email, + gnats_id => $id }); + } + close($users_fh); + return \@users; +} + +sub user_map { + my $self = shift; + $self->{user_map} ||= { map { $_->{gnats_id} => $_->{login_name} } + @{ $self->users } }; + return $self->{user_map}; +} + +sub add_user { + my ($self, $id, $email) = @_; + return if defined $self->user_map->{$id}; + $self->user_map->{$id} = $email; + push(@{ $self->users }, { login_name => $email, gnats_id => $id }); +} + +sub user_to_email { + my ($self, $value) = @_; + if (defined $self->user_map->{$value}) { + $value = $self->user_map->{$value}; + } + elsif ($value !~ /@/) { + my $domain = $self->config('default_email_domain'); + $value = "$value\@$domain"; + } + return $value; +} + +############ +# Products # +############ + +sub _read_products { + my $self = shift; + my $path = $self->config('gnats_path'); + my $file = "$path/gnats-adm/categories"; + $self->debug("Reading categories from $file"); + + open(my $categories_fh, '<', $file) || die "$file: $!"; + my @products; + foreach my $line (<$categories_fh>) { + $line = trim($line); + next if $line =~ /^#/; + my ($name, $description, $assigned_to, $cc) = split(':', $line, 4); + my %product = ( name => $name, description => $description ); + + my @initial_cc = split(',', $cc); + @initial_cc = @{ $self->translate_value('user', \@initial_cc) }; + $assigned_to = $self->translate_value('user', $assigned_to); + my %component = ( name => $self->config('component_name'), + description => $description, + initialowner => $assigned_to, + initial_cc => \@initial_cc ); + $product{components} = [\%component]; + push(@products, \%product); + } + close($categories_fh); + return \@products; +} + +################ +# Reading Bugs # +################ + +sub _read_bugs { + my $self = shift; + my $path = $self->config('gnats_path'); + my @directories = glob("$path/*"); + my @bugs; + foreach my $directory (@directories) { + next if !-d $directory; + my $name = basename($directory); + next if grep($_ eq $name, SKIP_DIRECTORIES); + push(@bugs, @{ $self->_parse_project($directory) }); + } + @bugs = sort { $a->{Number} <=> $b->{Number} } @bugs; + return \@bugs; +} + +sub _parse_project { + my ($self, $directory) = @_; + my @files = glob("$directory/*"); + + $self->debug("Reading Project: $directory"); + # Sometimes other files get into gnats directories. + @files = grep { basename($_) =~ /^\d+$/ } @files; + my @bugs; + my $count = 1; + my $total = scalar @files; + print basename($directory) . ":\n"; + foreach my $file (@files) { + push(@bugs, $self->_parse_bug_file($file)); + if (!$self->verbose) { + indicate_progress({ current => $count++, every => 5, + total => $total }); + } + } + return \@bugs; +} + +sub _parse_bug_file { + my ($self, $file) = @_; + $self->debug("Reading $file"); + open(my $fh, "<", $file) || die "$file: $!"; + my $email = Email::Simple::FromHandle->new($fh); + my $fields = $self->_get_gnats_field_data($email); + # We parse attachments here instead of during translate_bug, + # because otherwise we'd be taking up huge amounts of memory storing + # all the raw attachment data in memory. + $fields->{attachments} = $self->_parse_attachments($fields); + close($fh); + return $fields; +} + +sub _get_gnats_field_data { + my ($self, $email) = @_; + my ($current_field, @value_lines, %fields); + $email->reset_handle(); + my $handle = $email->handle; + foreach my $line (<$handle>) { + # If this line starts a field name + if ($line =~ FIELD_REGEX) { + my ($new_field, $rest_of_line) = ($1, $2); + + # If this is one of the last few PR fields, then make sure + # that we're getting our fields in the right order. + my $new_field_valid = 1; + my $current_field_pos = + lsearch(END_FIELD_ORDER, $current_field || ''); + if ($current_field_pos > -1) { + my $new_field_pos = lsearch(END_FIELD_ORDER, $new_field); + # We accept any field, as long as it's later than this one. + $new_field_valid = $new_field_pos > $current_field_pos ? 1 : 0; + } + + if ($new_field_valid) { + if ($current_field) { + $fields{$current_field} = _handle_lines(\@value_lines); + @value_lines = (); + } + $current_field = $new_field; + $line = $rest_of_line; + } + } + push(@value_lines, $line) if defined $line; + } + $fields{$current_field} = _handle_lines(\@value_lines); + $fields{cc} = [$email->header('Cc')] if $email->header('Cc'); + + # If the Originator is invalid and we don't have a translation for it, + # use the From header instead. + my $originator = $self->translate_value('reporter', $fields{Originator}, + { check_only => 1 }); + if ($originator !~ Bugzilla->params->{emailregexp}) { + # We use the raw header sometimes, because it looks like "From: user" + # which Email::Address won't parse but we can still use. + my $address = $email->header('From'); + my ($parsed) = Email::Address->parse($address); + if ($parsed) { + $address = $parsed->address; + } + if ($address) { + $self->debug( + "PR $fields{Number} had an Originator that was not a valid" + . " user ($fields{Originator}). Using From ($address)" + . " instead.\n"); + my $address_email = $self->translate_value('reporter', $address, + { check_only => 1 }); + if ($address_email !~ Bugzilla->params->{emailregexp}) { + $self->debug(" From was also invalid, using default_originator.\n"); + $address = $self->config('default_originator'); + } + $fields{Originator} = $address; + } + } + + $self->debug(\%fields, 3); + return \%fields; +} + +sub _handle_lines { + my ($lines) = @_; + my $value = join('', @$lines); + $value =~ s/\s+$//; + return $value; +} + +#################### +# Translating Bugs # +#################### + +sub translate_bug { + my ($self, $fields) = @_; + + my ($bug, $other_fields) = $self->SUPER::translate_bug($fields); + + $bug->{attachments} = delete $other_fields->{attachments}; + + if (defined $other_fields->{_add_to_comment}) { + $bug->{comment} .= delete $other_fields->{_add_to_comment}; + } + + my ($changes, $extra_comment) = + $self->_parse_audit_trail($bug, $other_fields->{'Audit-Trail'}); + + my @comments; + foreach my $change (@$changes) { + if (exists $change->{comment}) { + push(@comments, { + thetext => $change->{comment}, + who => $change->{who}, + bug_when => $change->{bug_when} }); + delete $change->{comment}; + } + } + $bug->{history} = $changes; + + if (trim($extra_comment)) { + push(@comments, { thetext => $extra_comment, who => $bug->{reporter}, + bug_when => $bug->{delta_ts} || $bug->{creation_ts} }); + } + $bug->{comments} = \@comments; + + $bug->{component} = $self->config('component_name'); + if (!$bug->{short_desc}) { + $bug->{short_desc} = NO_SUBJECT; + } + + foreach my $attachment (@{ $bug->{attachments} || [] }) { + $attachment->{submitter} = $bug->{reporter}; + $attachment->{creation_ts} = $bug->{creation_ts}; + } + + $self->debug($bug, 3); + return $bug; +} + +sub _parse_audit_trail { + my ($self, $bug, $audit_trail) = @_; + return [] if !trim($audit_trail); + $self->debug(" Parsing audit trail...", 2); + + if ($audit_trail !~ /^\S+-Changed-\S+:/ms) { + # This is just a comment from the bug's creator. + $self->debug(" Audit trail is just a comment.", 2); + return ([], $audit_trail); + } + + my (@changes, %current_data, $current_column, $on_why); + my $extra_comment = ''; + my $current_field; + my @all_lines = split("\n", $audit_trail); + foreach my $line (@all_lines) { + # GNATS history looks like: + # Status-Changed-From-To: open->closed + # Status-Changed-By: jack + # Status-Changed-When: Mon May 12 14:46:59 2003 + # Status-Changed-Why: + # This is some comment here about the change. + if ($line =~ /^(\S+)-Changed-(\S+):(.*)/) { + my ($field, $column, $value) = ($1, $2, $3); + my $bz_field = $self->translate_field($field); + # If it's not a field we're importing, we don't care about + # its history. + next if !$bz_field; + # GNATS doesn't track values for description changes, + # unfortunately, and that's the only information we'd be able to + # use in Bugzilla for the audit trail on that field. + next if $bz_field eq 'comment'; + $current_field = $bz_field if !$current_field; + if ($bz_field ne $current_field) { + $self->_store_audit_change( + \@changes, $current_field, \%current_data); + %current_data = (); + $current_field = $bz_field; + } + $value = trim($value); + $self->debug(" $bz_field $column: $value", 3); + if ($column eq 'From-To') { + my ($from, $to) = split('->', $value, 2); + # Sometimes there's just a - instead of a -> between the values. + if (!defined($to)) { + ($from, $to) = split('-', $value, 2); + } + $current_data{added} = $to; + $current_data{removed} = $from; + } + elsif ($column eq 'By') { + my $email = $self->translate_value('user', $value); + # Sometimes we hit users in the audit trail that we haven't + # seen anywhere else. + $current_data{who} = $email; + } + elsif ($column eq 'When') { + $current_data{bug_when} = $self->parse_date($value); + } + if ($column eq 'Why') { + $value = '' if !defined $value; + $current_data{comment} = $value; + $on_why = 1; + } + else { + $on_why = 0; + } + } + elsif ($on_why) { + # "Why" lines are indented four characters. + $line =~ s/^\s{4}//; + $current_data{comment} .= "$line\n"; + } + else { + $self->debug( + "Extra Audit-Trail line on $bug->{product} $bug->{bug_id}:" + . " $line\n", 2); + $extra_comment .= "$line\n"; + } + } + $self->_store_audit_change(\@changes, $current_field, \%current_data); + return (\@changes, $extra_comment); +} + +sub _store_audit_change { + my ($self, $changes, $old_field, $current_data) = @_; + + $current_data->{field} = $old_field; + $current_data->{removed} = + $self->translate_value($old_field, $current_data->{removed}); + $current_data->{added} = + $self->translate_value($old_field, $current_data->{added}); + push(@$changes, { %$current_data }); +} + +sub _parse_attachments { + my ($self, $fields) = @_; + my $unformatted = delete $fields->{'Unformatted'}; + my $gnats_boundary = GNATS_BOUNDARY; + # A sanity checker to make sure that we're parsing attachments right. + my $num_attachments = 0; + $num_attachments++ while ($unformatted =~ /\Q$gnats_boundary\E/g); + # Sometimes there's a GNATS_BOUNDARY that is on the same line as other data. + $unformatted =~ s/(\S\s*)\Q$gnats_boundary\E$/$1\n$gnats_boundary/mg; + # Often the "Unformatted" section starts with stuff before + # ----gnatsweb-attachment---- that isn't necessary. + $unformatted =~ s/^\s*From:.+?Reply-to:[^\n]+//s; + $unformatted = trim($unformatted); + return [] if !$unformatted; + $self->debug('Reading attachments...', 2); + my $boundary = generate_random_password(48); + $unformatted =~ s/\Q$gnats_boundary\E/--$boundary/g; + # Sometimes the whole Unformatted section is indented by exactly + # one space, and needs to be fixed. + if ($unformatted =~ /--\Q$boundary\E\n /) { + $unformatted =~ s/^ //mg; + } + $unformatted = <parts; + # Remove the fake body. + my $part1 = shift @parts; + if ($part1->body) { + $self->debug(" Additional Unformatted data found on " + . $fields->{Category} . " bug " . $fields->{Number}); + $self->debug($part1->body, 3); + $fields->{_add_comment} .= "\n\nUnformatted:\n" . $part1->body; + } + + my @attachments; + foreach my $part (@parts) { + $self->debug(' Parsing attachment: ' . $part->filename); + my $temp_fh = IO::File->new_tmpfile or die ("Can't create tempfile: $!"); + $temp_fh->binmode; + print $temp_fh $part->body; + my $content_type = $part->content_type; + $content_type =~ s/; name=.+$//; + my $attachment = { filename => $part->filename, + description => $part->filename, + mimetype => $content_type, + data => $temp_fh }; + $self->debug($attachment, 3); + push(@attachments, $attachment); + } + + if (scalar(@attachments) ne $num_attachments) { + warn "WARNING: Expected $num_attachments attachments but got " + . scalar(@attachments) . "\n" ; + $self->debug($unformatted, 3); + } + return \@attachments; +} + +sub translate_value { + my $self = shift; + my ($field, $value, $options) = @_; + my $original_value = $value; + $options ||= {}; + + if (!ref($value) and grep($_ eq $field, $self->USER_FIELDS)) { + if ($value =~ /(\S+\@\S+)/) { + $value = $1; + $value =~ s/^$//; + } + else { + # Sometimes names have extra stuff on the end like "(Somebody's Name)" + $value =~ s/\s+\(.+\)$//; + # Sometimes user fields look like "(user)" instead of just "user". + $value =~ s/^\((.+)\)$/$1/; + $value = trim($value); + } + } + + if ($field eq 'version' and $value ne '') { + my $version_re = $self->config('version_regex'); + if ($version_re and $value =~ $version_re) { + $value = $1; + } + # In the GNATS that I tested this with, there were many extremely long + # values for "version" that caused some import problems (they were + # longer than the max allowed version value). So if the version value + # is longer than 32 characters, pull out the first thing that looks + # like a version number. + elsif (length($value) > LONG_VERSION_LENGTH) { + $value =~ s/^.+?\b(\d[\w\.]+)\b.+$/$1/; + } + } + + my @args = @_; + $args[1] = $value; + + $value = $self->SUPER::translate_value(@args); + return $value if ref $value; + + if (grep($_ eq $field, $self->USER_FIELDS)) { + my $from_value = $value; + $value = $self->user_to_email($value); + $args[1] = $value; + # If we got something new from user_to_email, do any necessary + # translation of it. + $value = $self->SUPER::translate_value(@args); + if (!$options->{check_only}) { + $self->add_user($from_value, $value); + } + } + + return $value; +} + +1; -- cgit v1.2.3-24-g4f1b