Bug 22353: Automatic duplicate bug detection on enter_bug.cgi

r=glob, a=mkanat
author: Max Kanat-Alexander <mkanat@bugzilla.org> 2010-06-22 04:10:21 +0200
committer: Max Kanat-Alexander <mkanat@bugzilla.org> 2010-06-22 04:10:21 +0200
commit: 4acb2424e62cbd64bc92a5dec2cbe1e2b7096157 (patch)
tree: 9076b3809846e8fc58ce720e0cd32dc0d3c8ff77 /Bugzilla/Bug.pm
parent: 601bda78fa436e7030ebcefc589d930c99f1373e (diff)
download: bugzilla-4acb2424e62cbd64bc92a5dec2cbe1e2b7096157.tar.gz
bugzilla-4acb2424e62cbd64bc92a5dec2cbe1e2b7096157.tar.xz
1 files changed, 115 insertions, 18 deletions
diff --git a/Bugzilla/Bug.pm b/Bugzilla/Bug.pm
index 6df7363d5..80a4b5933 100644
--- a/Bugzilla/Bug.pm
+++ b/Bugzilla/Bug.pm
@@ -49,7 +49,7 @@ use Bugzilla::Group;
 use Bugzilla::Status;
 use Bugzilla::Comment;
 
-use List::MoreUtils qw(firstidx);
+use List::MoreUtils qw(firstidx uniq);
 use List::Util qw(min first);
 use Storable qw(dclone);
 use URI;
@@ -446,6 +446,87 @@ sub match {
     return $class->SUPER::match(@_);
 }
 
+# Finds existing bugs whose summaries share words with $params->{summary}.
+#   summary  - text whose words are searched in bugs_fulltext.short_desc.
+#   products - optional product object or arrayref of them (each needs
+#              ->id); restricts the search to those products.
+#   limit    - optional natural number (else param_must_be_numeric is
+#              thrown); defaults to and is capped at MAX_POSSIBLE_DUPLICATES.
+# Bugs resolved as DUPLICATE are replaced by the end of their duplicate chain.
+# Returns [] if no word is usable, else new_from_list() of the visible bugs.
+sub possible_duplicates {
+    my ($class, $params) = @_;
+    my $short_desc = $params->{summary};
+    my $products = $params->{products} || [];
+    # "ne", because "!ref(...) eq 'ARRAY'" negates first and is never true.
+    $products = [$products] if ref($products) ne 'ARRAY';
+    # Validate the limit before it is used in a numeric comparison.
+    my $limit = $params->{limit} || MAX_POSSIBLE_DUPLICATES;
+    my $orig_limit = $limit;
+    detaint_natural($limit)
+        || ThrowCodeError('param_must_be_numeric',
+                          { function => 'possible_duplicates',
+                            param    => $orig_limit });
+    $limit = MAX_POSSIBLE_DUPLICATES if $limit > MAX_POSSIBLE_DUPLICATES;
+
+    my $dbh = Bugzilla->dbh;
+    my $user = Bugzilla->user;
+    # Keep every run of word characters that is longer than 2 characters.
+    my @words = grep { length($_) > 2 } (($short_desc || '') =~ /(\w+)/g);
+    return [] if !@words;
+
+    my ($where_sql, $relevance_sql);
+    if ($dbh->FULLTEXT_OR) {
+        my $joined_terms = join($dbh->FULLTEXT_OR, @words);
+        ($where_sql, $relevance_sql) = $dbh->sql_fulltext_search(
+            'bugs_fulltext.short_desc', $joined_terms, 1);
+        $relevance_sql ||= $where_sql;
+    }
+    else {
+        my (@where, @relevance);
+        my $count = 0;
+        foreach my $word (@words) {
+            $count++;
+            my ($term, $rel_term) = $dbh->sql_fulltext_search(
+                'bugs_fulltext.short_desc', $word, $count);
+            push(@where, $term);
+            push(@relevance, $rel_term || $term);
+        }
+        $where_sql = join(' OR ', @where);
+        $relevance_sql = join(' + ', @relevance);
+    }
+
+    my $product_ids = join(',', map { $_->id } @$products);
+    my $product_sql = $product_ids ? "AND product_id IN ($product_ids)" : "";
+
+    # Because we collapse duplicates, we want to get slightly more bugs
+    # than were actually asked for.
+    my $sql_limit = $limit + 5;
+    my $possible_dupes = $dbh->selectall_arrayref(
+        "SELECT bugs.bug_id AS bug_id, bugs.resolution AS resolution,
+                ($relevance_sql) AS relevance
+           FROM bugs
+                INNER JOIN bugs_fulltext ON bugs.bug_id = bugs_fulltext.bug_id
+          WHERE ($where_sql) $product_sql
+       ORDER BY relevance DESC, bug_id DESC
+          LIMIT $sql_limit", {Slice=>{}});
+
+    my @actual_dupe_ids;
+    # Resolve duplicates into their ultimate target duplicates.
+    foreach my $bug (@$possible_dupes) {
+        my $push_id = $bug->{bug_id};
+        if ($bug->{resolution} && $bug->{resolution} eq 'DUPLICATE') {
+            $push_id = _resolve_ultimate_dup_id($bug->{bug_id});
+        }
+        push(@actual_dupe_ids, $push_id);
+    }
+    @actual_dupe_ids = uniq @actual_dupe_ids;
+    splice(@actual_dupe_ids, $limit) if @actual_dupe_ids > $limit;
+
+    my $visible = $user->visible_bugs(\@actual_dupe_ids);
+    return $class->new_from_list($visible);
+}
+
 # Docs for create() (there's no POD in this file yet, but we very
 # much need this documented right now):
 #
@@ -1426,23 +1507,7 @@ sub _check_dup_id {
 
     # Make sure a loop isn't created when marking this bug
     # as duplicate.
-    my %dupes;
-    my $this_dup = $dupe_of;
-    my $sth = $dbh->prepare('SELECT dupe_of FROM duplicates WHERE dupe = ?');
-
-    while ($this_dup) {
-        if ($this_dup == $self->id) {
-            ThrowUserError('dupe_loop_detected', { bug_id  => $self->id,
-                                                   dupe_of => $dupe_of });
-        }
-        # If $dupes{$this_dup} is already set to 1, then a loop
-        # already exists which does not involve this bug.
-        # As the user is not responsible for this loop, do not
-        # prevent him from marking this bug as a duplicate.
-        last if exists $dupes{$this_dup};
-        $dupes{$this_dup} = 1;
-        $this_dup = $dbh->selectrow_array($sth, undef, $this_dup);
-    }
+   _resolve_ultimate_dup_id($self->id, $dupe_of, 1);
 
     my $cur_dup = $self->dup_id || 0;
     if ($cur_dup != $dupe_of && Bugzilla->params->{'commentonduplicate'}
@@ -2843,6 +2908,38 @@ sub dup_id {
     return $self->{'dup_id'};
 }
 
+# Follows the duplicates table and returns the id of the last bug reached.
+#   $bug_id  - the bug whose duplicate chain is resolved.
+#   $dupe_of - optional id to start following from, instead of looking up
+#              the dupe_of recorded for $bug_id.
+#   $loops_are_an_error - if true, a chain leading back to $bug_id throws
+#              dupe_loop_detected; otherwise the last id before it is returned.
+sub _resolve_ultimate_dup_id {
+    my ($bug_id, $dupe_of, $loops_are_an_error) = @_;
+    my $dbh = Bugzilla->dbh;
+    my $lookup = $dbh->prepare('SELECT dupe_of FROM duplicates WHERE dupe = ?');
+
+    my %seen;
+    my $previous = $bug_id;
+    my $current = $dupe_of || $dbh->selectrow_array($lookup, undef, $bug_id);
+    while ($current) {
+        if ($current == $bug_id) {
+            # The chain leads back to the bug we started from.
+            return $previous if !$loops_are_an_error;
+            ThrowUserError('dupe_loop_detected', { bug_id  => $bug_id,
+                                                   dupe_of => $dupe_of });
+        }
+        # An id seen before means a loop exists that does not involve
+        # $bug_id; the caller is not responsible for it, so stop quietly.
+        last if exists $seen{$current};
+        $seen{$current} = 1;
+        $previous = $current;
+        $current = $dbh->selectrow_array($lookup, undef, $current);
+    }
+
+    return $previous;
+}
+
 sub actual_time {
     my ($self) = @_;
     return $self->{'actual_time'} if exists $self->{'actual_time'};
author	Max Kanat-Alexander <mkanat@bugzilla.org>	2010-06-22 04:10:21 +0200
committer	Max Kanat-Alexander <mkanat@bugzilla.org>	2010-06-22 04:10:21 +0200
commit	4acb2424e62cbd64bc92a5dec2cbe1e2b7096157 (patch)
tree	9076b3809846e8fc58ce720e0cd32dc0d3c8ff77 /Bugzilla/Bug.pm
parent	601bda78fa436e7030ebcefc589d930c99f1373e (diff)
download	bugzilla-4acb2424e62cbd64bc92a5dec2cbe1e2b7096157.tar.gz bugzilla-4acb2424e62cbd64bc92a5dec2cbe1e2b7096157.tar.xz