From 4acb2424e62cbd64bc92a5dec2cbe1e2b7096157 Mon Sep 17 00:00:00 2001 From: Max Kanat-Alexander Date: Mon, 21 Jun 2010 19:10:21 -0700 Subject: Bug 22353: Automatic duplicate bug detection on enter_bug.cgi r=glob, a=mkanat --- Bugzilla/Bug.pm | 133 +++++++++++++++++++++++++++++++++++++++------ Bugzilla/Constants.pm | 5 ++ Bugzilla/DB.pm | 5 ++ Bugzilla/DB/Mysql.pm | 5 +- Bugzilla/DB/Oracle.pm | 5 +- Bugzilla/User.pm | 2 +- Bugzilla/WebService/Bug.pm | 35 ++++++++++-- 7 files changed, 161 insertions(+), 29 deletions(-) (limited to 'Bugzilla') diff --git a/Bugzilla/Bug.pm b/Bugzilla/Bug.pm index 6df7363d5..80a4b5933 100644 --- a/Bugzilla/Bug.pm +++ b/Bugzilla/Bug.pm @@ -49,7 +49,7 @@ use Bugzilla::Group; use Bugzilla::Status; use Bugzilla::Comment; -use List::MoreUtils qw(firstidx); +use List::MoreUtils qw(firstidx uniq); use List::Util qw(min first); use Storable qw(dclone); use URI; @@ -446,6 +446,87 @@ sub match { return $class->SUPER::match(@_); } +sub possible_duplicates { + my ($class, $params) = @_; + my $short_desc = $params->{summary}; + my $products = $params->{products} || []; + my $limit = $params->{limit} || MAX_POSSIBLE_DUPLICATES; + $limit = MAX_POSSIBLE_DUPLICATES if $limit > MAX_POSSIBLE_DUPLICATES; + $products = [$products] if !ref($products) eq 'ARRAY'; + + my $orig_limit = $limit; + detaint_natural($limit) + || ThrowCodeError('param_must_be_numeric', + { function => 'possible_duplicates', + param => $orig_limit }); + + my $dbh = Bugzilla->dbh; + my $user = Bugzilla->user; + my @words = split(/[\b\s]+/, $short_desc || ''); + # Exclude punctuation from the array. + @words = map { /(\w+)/; $1 } @words; + # And make sure that each word is longer than 2 characters. + @words = grep { defined $_ and length($_) > 2 } @words; + + return [] if !@words; + + my ($where_sql, $relevance_sql); + if ($dbh->FULLTEXT_OR) { + my $joined_terms = join($dbh->FULLTEXT_OR, @words); + ($where_sql, $relevance_sql) = + $dbh->sql_fulltext_search('bugs_fulltext.short_desc', + $joined_terms, 1); + $relevance_sql ||= $where_sql; + } + else { + my (@where, @relevance); + my $count = 0; + foreach my $word (@words) { + $count++; + my ($term, $rel_term) = $dbh->sql_fulltext_search( + 'bugs_fulltext.short_desc', $word, $count); + push(@where, $term); + push(@relevance, $rel_term || $term); + } + + $where_sql = join(' OR ', @where); + $relevance_sql = join(' + ', @relevance); + } + + my $product_ids = join(',', map { $_->id } @$products); + my $product_sql = $product_ids ? "AND product_id IN ($product_ids)" : ""; + + # Because we collapse duplicates, we want to get slightly more bugs + # than were actually asked for. + my $sql_limit = $limit + 5; + + my $possible_dupes = $dbh->selectall_arrayref( + "SELECT bugs.bug_id AS bug_id, bugs.resolution AS resolution, + ($relevance_sql) AS relevance + FROM bugs + INNER JOIN bugs_fulltext ON bugs.bug_id = bugs_fulltext.bug_id + WHERE ($where_sql) $product_sql + ORDER BY relevance DESC, bug_id DESC + LIMIT $sql_limit", {Slice=>{}}); + + my @actual_dupe_ids; + # Resolve duplicates into their ultimate target duplicates. + foreach my $bug (@$possible_dupes) { + my $push_id = $bug->{bug_id}; + if ($bug->{resolution} && $bug->{resolution} eq 'DUPLICATE') { + $push_id = _resolve_ultimate_dup_id($bug->{bug_id}); + } + push(@actual_dupe_ids, $push_id); + } + @actual_dupe_ids = uniq @actual_dupe_ids; + if (scalar @actual_dupe_ids > $limit) { + @actual_dupe_ids = @actual_dupe_ids[0..($limit-1)]; + } + + my $visible = $user->visible_bugs(\@actual_dupe_ids); + return $class->new_from_list($visible); +} + # Docs for create() (there's no POD in this file yet, but we very # much need this documented right now): # @@ -1426,23 +1507,7 @@ sub _check_dup_id { # Make sure a loop isn't created when marking this bug # as duplicate. - my %dupes; - my $this_dup = $dupe_of; - my $sth = $dbh->prepare('SELECT dupe_of FROM duplicates WHERE dupe = ?'); - - while ($this_dup) { - if ($this_dup == $self->id) { - ThrowUserError('dupe_loop_detected', { bug_id => $self->id, - dupe_of => $dupe_of }); - } - # If $dupes{$this_dup} is already set to 1, then a loop - # already exists which does not involve this bug. - # As the user is not responsible for this loop, do not - # prevent him from marking this bug as a duplicate. - last if exists $dupes{$this_dup}; - $dupes{$this_dup} = 1; - $this_dup = $dbh->selectrow_array($sth, undef, $this_dup); - } + _resolve_ultimate_dup_id($self->id, $dupe_of, 1); my $cur_dup = $self->dup_id || 0; if ($cur_dup != $dupe_of && Bugzilla->params->{'commentonduplicate'} @@ -2843,6 +2908,38 @@ sub dup_id { return $self->{'dup_id'}; } +sub _resolve_ultimate_dup_id { + my ($bug_id, $dupe_of, $loops_are_an_error) = @_; + my $dbh = Bugzilla->dbh; + my $sth = $dbh->prepare('SELECT dupe_of FROM duplicates WHERE dupe = ?'); + + my $this_dup = $dupe_of || $dbh->selectrow_array($sth, undef, $bug_id); + my $last_dup = $bug_id; + + my %dupes; + while ($this_dup) { + if ($this_dup == $bug_id) { + if ($loops_are_an_error) { + ThrowUserError('dupe_loop_detected', { bug_id => $bug_id, + dupe_of => $dupe_of }); + } + else { + return $last_dup; + } + } + # If $dupes{$this_dup} is already set to 1, then a loop + # already exists which does not involve this bug. + # As the user is not responsible for this loop, do not + # prevent him from marking this bug as a duplicate. + return $last_dup if exists $dupes{$this_dup}; + $dupes{$this_dup} = 1; + $last_dup = $this_dup; + $this_dup = $dbh->selectrow_array($sth, undef, $this_dup); + } + + return $last_dup; +} + sub actual_time { my ($self) = @_; return $self->{'actual_time'} if exists $self->{'actual_time'}; diff --git a/Bugzilla/Constants.pm b/Bugzilla/Constants.pm index 55ef4a0e3..9af9e7b72 100644 --- a/Bugzilla/Constants.pm +++ b/Bugzilla/Constants.pm @@ -175,6 +175,7 @@ use File::Basename; MAX_FIELD_VALUE_SIZE MAX_FREETEXT_LENGTH MAX_BUG_URL_LENGTH + MAX_POSSIBLE_DUPLICATES PASSWORD_DIGEST_ALGORITHM PASSWORD_SALT_LENGTH @@ -527,6 +528,10 @@ use constant MAX_FREETEXT_LENGTH => 255; # The longest a bug URL in a BUG_URLS field can be. use constant MAX_BUG_URL_LENGTH => 255; +# The largest number of possible duplicates that Bug::possible_duplicates +# will return. +use constant MAX_POSSIBLE_DUPLICATES => 25; + # This is the name of the algorithm used to hash passwords before storing # them in the database. This can be any string that is valid to pass to # Perl's "Digest" module. Note that if you change this, it won't take diff --git a/Bugzilla/DB.pm b/Bugzilla/DB.pm index 8b8d74c90..8c1aba8dd 100644 --- a/Bugzilla/DB.pm +++ b/Bugzilla/DB.pm @@ -73,6 +73,11 @@ use constant ENUM_DEFAULTS => { resolution => ["","FIXED","INVALID","WONTFIX", "DUPLICATE","WORKSFORME"], }; +# The character that means "OR" in a boolean fulltext search. If empty, +# the database doesn't support OR searches in fulltext searches. +# Used by Bugzilla::Bug::possible_duplicates. +use constant FULLTEXT_OR => ''; + ##################################################################### # Connection Methods ##################################################################### diff --git a/Bugzilla/DB/Mysql.pm b/Bugzilla/DB/Mysql.pm index 13069a78a..4b90a2a34 100644 --- a/Bugzilla/DB/Mysql.pm +++ b/Bugzilla/DB/Mysql.pm @@ -40,8 +40,8 @@ For interface details see L and L. =cut package Bugzilla::DB::Mysql; - use strict; +use base qw(Bugzilla::DB); use Bugzilla::Constants; use Bugzilla::Install::Util qw(install_string); @@ -57,8 +57,7 @@ use Text::ParseWords; # MAX_COMMENT_LENGTH is big. use constant MAX_COMMENTS => 50; -# This module extends the DB interface via inheritance -use base qw(Bugzilla::DB); +use constant FULLTEXT_OR => '|'; sub new { my ($class, $params) = @_; diff --git a/Bugzilla/DB/Oracle.pm b/Bugzilla/DB/Oracle.pm index 6fa7a9869..a671a0e68 100644 --- a/Bugzilla/DB/Oracle.pm +++ b/Bugzilla/DB/Oracle.pm @@ -35,16 +35,14 @@ For interface details see L and L. =cut package Bugzilla::DB::Oracle; - use strict; +use base qw(Bugzilla::DB); use DBD::Oracle; use DBD::Oracle qw(:ora_types); use Bugzilla::Constants; use Bugzilla::Error; use Bugzilla::Util; -# This module extends the DB interface via inheritance -use base qw(Bugzilla::DB); ##################################################################### # Constants @@ -52,6 +50,7 @@ use base qw(Bugzilla::DB); use constant EMPTY_STRING => '__BZ_EMPTY_STR__'; use constant ISOLATION_LEVEL => 'READ COMMITTED'; use constant BLOB_TYPE => { ora_type => ORA_BLOB }; +use constant FULLTEXT_OR => ' OR '; sub new { my ($class, $params) = @_; diff --git a/Bugzilla/User.pm b/Bugzilla/User.pm index aa01d93a5..6c7be2241 100644 --- a/Bugzilla/User.pm +++ b/Bugzilla/User.pm @@ -900,7 +900,7 @@ sub can_enter_product { $product && grep($_->name eq $product->name, @{ $self->get_enterable_products }); - return 1 if $can_enter; + return $product if $can_enter; return 0 unless $warn == THROW_ERROR; diff --git a/Bugzilla/WebService/Bug.pm b/Bugzilla/WebService/Bug.pm index ee5591027..d5c1fab74 100644 --- a/Bugzilla/WebService/Bug.pm +++ b/Bugzilla/WebService/Bug.pm @@ -36,6 +36,7 @@ use Bugzilla::Util qw(trick_taint trim); use Bugzilla::Version; use Bugzilla::Milestone; use Bugzilla::Status; +use Bugzilla::Token qw(issue_hash_token); ############# # Constants # @@ -322,7 +323,7 @@ sub get { else { $bug = Bugzilla::Bug->check($bug_id); } - push(@bugs, $self->_bug_to_hash($bug)); + push(@bugs, $self->_bug_to_hash($bug, $params)); } return { bugs => \@bugs, faults => \@faults }; @@ -421,7 +422,28 @@ sub search { my $bugs = Bugzilla::Bug->match($params); my $visible = Bugzilla->user->visible_bugs($bugs); - my @hashes = map { $self->_bug_to_hash($_) } @$visible; + my @hashes = map { $self->_bug_to_hash($_, $params) } @$visible; + return { bugs => \@hashes }; +} + +sub possible_duplicates { + my ($self, $params) = validate(@_, 'product'); + my $user = Bugzilla->user; + + # Undo the array-ification that validate() does, for "summary". + $params->{summary} || ThrowCodeError('param_required', + { function => 'Bug.possible_duplicates', param => 'summary' }); + + my @products; + foreach my $name (@{ $params->{'product'} || [] }) { + my $object = $user->can_enter_product($name, THROW_ERROR); + push(@products, $object); + } + + my $possible_dupes = Bugzilla::Bug->possible_duplicates( + { summary => $params->{summary}, products => \@products, + limit => $params->{limit} }); + my @hashes = map { $self->_bug_to_hash($_, $params) } @$possible_dupes; return { bugs => \@hashes }; } @@ -617,7 +639,7 @@ sub attachments { # A helper for get() and search(). sub _bug_to_hash { - my ($self, $bug) = @_; + my ($self, $bug, $filters) = @_; # Timetracking fields are deleted if the user doesn't belong to # the corresponding group. @@ -646,6 +668,11 @@ sub _bug_to_hash { $item{'component'} = $self->type('string', $bug->component); $item{'dupe_of'} = $self->type('int', $bug->dup_id); + if (Bugzilla->user->id) { + my $token = issue_hash_token([$bug->id, $bug->delta_ts]); + $item{'update_token'} = $self->type('string', $token); + } + # if we do not delete this key, additional user info, including their # real name, etc, will wind up in the 'internals' hashref delete $item{internals}->{assigned_to_obj}; @@ -659,7 +686,7 @@ sub _bug_to_hash { $item{'alias'} = undef; } - return \%item; + return filter $filters, \%item; } sub _attachment_to_hash { -- cgit v1.2.3-24-g4f1b