From 2429c5daba37968dacb9b84e6eb671b057765fda Mon Sep 17 00:00:00 2001 From: "mkanat%bugzilla.org" <> Date: Tue, 25 Mar 2008 03:47:21 +0000 Subject: Bug 399370: Fulltext search with a LIKE on bugs.short_desc is too slow (make Bugzilla use a separate fulltext table) Patch By Max Kanat-Alexander (module owner) a=mkanat --- Bugzilla/Bug.pm | 47 ++++++++++++++++++++++++++------ Bugzilla/DB/Mysql.pm | 20 +++++++++++--- Bugzilla/DB/Pg.pm | 4 +++ Bugzilla/DB/Schema.pm | 25 +++++++++++++++-- Bugzilla/DB/Schema/Mysql.pm | 2 +- Bugzilla/Install/DB.pm | 42 ++++++++++++++++++++++++----- Bugzilla/Search.pm | 66 +++++++++------------------------------------ importxml.pl | 1 + 8 files changed, 133 insertions(+), 74 deletions(-) diff --git a/Bugzilla/Bug.pm b/Bugzilla/Bug.pm index 96f185c1f..6223320d6 100755 --- a/Bugzilla/Bug.pm +++ b/Bugzilla/Bug.pm @@ -408,12 +408,6 @@ sub create { } } - $dbh->bz_commit_transaction(); - - # Because MySQL doesn't support transactions on the longdescs table, - # we do this after we've committed the transaction. That way we're - # fairly sure we're inserting a good Bug ID. - # And insert the comment. We always insert a comment on bug creation, # but sometimes it's blank. my @columns = qw(bug_id who bug_when thetext); @@ -429,6 +423,12 @@ sub create { $dbh->do('INSERT INTO longdescs (' . join(',', @columns) . ") VALUES ($qmarks)", undef, @values); + $dbh->bz_commit_transaction(); + + # Because MySQL doesn't support transactions on the fulltext table, + # we do this after we've committed the transaction. That way we're + # sure we're inserting a good Bug ID. + $bug->_sync_fulltext('new bug'); return $bug; } @@ -615,6 +615,13 @@ sub update { $self->{delta_ts} = $delta_ts; } + # The only problem with this here is that update() is often called + # in the middle of a transaction, and if that transaction is rolled + # back, this change will *not* be rolled back. As we expect rollbacks + # to be extremely rare, that is OK for us. + $self->_sync_fulltext() + if $self->{added_comments} || $changes->{short_desc}; + return $changes; } @@ -766,6 +773,30 @@ sub update_keywords { return [$removed_keywords, $added_keywords]; } +# Should be called any time you update short_desc or change a comment. +sub _sync_fulltext { + my ($self, $new_bug) = @_; + my $dbh = Bugzilla->dbh; + if ($new_bug) { + $dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc) + SELECT bug_id, short_desc FROM bugs WHERE bug_id = ?', + undef, $self->id); + } + else { + $dbh->do('UPDATE bugs_fulltext SET short_desc = ? WHERE bug_id = ?', + undef, $self->short_desc, $self->id); + } + my $comments = $dbh->selectall_arrayref( + 'SELECT thetext, isprivate FROM longdescs WHERE bug_id = ?', + undef, $self->id); + my $all = join("\n", map { $_->[0] } @$comments); + my @no_private = grep { !$_->[1] } @$comments; + my $nopriv_string = join("\n", map { $_->[0] } @no_private); + $dbh->do('UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ? + WHERE bug_id = ?', undef, $all, $nopriv_string, $self->id); +} + + # This is the correct way to delete bugs from the DB. # No bug should be deleted from anywhere else except from here. # @@ -821,11 +852,10 @@ sub remove_from_db { # Several of the previous tables also depend on attach_id. $dbh->do("DELETE FROM attachments WHERE bug_id = ?", undef, $bug_id); $dbh->do("DELETE FROM bugs WHERE bug_id = ?", undef, $bug_id); + $dbh->do("DELETE FROM longdescs WHERE bug_id = ?", undef, $bug_id); $dbh->bz_commit_transaction(); - $dbh->do("DELETE FROM longdescs WHERE bug_id = ?", undef, $bug_id); - # Now this bug no longer exists $self->DESTROY; return $self; @@ -2696,6 +2726,7 @@ sub update_comment { # We assume _check_comment() has already been called earlier. Bugzilla->dbh->do('UPDATE longdescs SET thetext = ? WHERE comment_id = ?', undef, ($new_comment, $comment_id)); + $self->_sync_fulltext(); # Update the comment object with this new text. $current_comment_obj[0]->{'body'} = $new_comment; diff --git a/Bugzilla/DB/Mysql.pm b/Bugzilla/DB/Mysql.pm index 2da0c44ea..0bd7b7d37 100644 --- a/Bugzilla/DB/Mysql.pm +++ b/Bugzilla/DB/Mysql.pm @@ -255,11 +255,11 @@ EOT print "\nISAM->MyISAM table conversion done.\n\n"; } - my $sd_index_deleted = 0; + my ($sd_index_deleted, $longdescs_index_deleted); my @tables = $self->bz_table_list_real(); - # We want to convert the bugs table to MyISAM, but it's possible that - # it has a fulltext index on it and this will fail unless we remove - # the index. + # We want to convert tables to InnoDB, but it's possible that they have + # fulltext indexes on them, and conversation will fail unless we remove + # the indexes. if (grep($_ eq 'bugs', @tables)) { if ($self->bz_index_info_real('bugs', 'short_desc')) { $self->bz_drop_index_raw('bugs', 'short_desc'); @@ -268,6 +268,13 @@ EOT $self->bz_drop_index_raw('bugs', 'bugs_short_desc_idx'); $sd_index_deleted = 1; # Used for later schema cleanup. } + if ($self->bz_index_info_real('longdescs', 'thetext')) { + $self->bz_drop_index_raw('longdescs', 'thetext'); + } + if ($self->bz_index_info_real('longdescs', 'longdescs_thetext_idx')) { + $self->bz_drop_index_raw('longdescs', 'longdescs_thetext_idx'); + $longdescs_index_deleted = 1; # For later schema cleanup. + } } # Upgrade tables from MyISAM to InnoDB @@ -480,6 +487,11 @@ EOT $self->_bz_real_schema->delete_index('bugs', 'bugs_short_desc_idx'); $self->_bz_store_real_schema; } + if ($longdescs_index_deleted) { + $self->_bz_real_schema->delete_index('longdescs', + 'longdescs_thetext_idx'); + $self->_bz_store_real_schema; + } # The old timestamp fields need to be adjusted here instead of in # checksetup. Otherwise the UPDATE statements inside of bz_add_column diff --git a/Bugzilla/DB/Pg.pm b/Bugzilla/DB/Pg.pm index 9675e1f26..4777ba89a 100644 --- a/Bugzilla/DB/Pg.pm +++ b/Bugzilla/DB/Pg.pm @@ -179,6 +179,10 @@ sub bz_setup_database { # field, because it can't have index data longer than 2770 # characters on that field. $self->bz_drop_index('longdescs', 'longdescs_thetext_idx'); + # Same for all the comments fields in the fulltext table. + $self->bz_drop_index('bugs_fulltext', 'bugs_fulltext_comments_idx'); + $self->bz_drop_index('bugs_fulltext', + 'bugs_fulltext_comments_noprivate_idx'); # PostgreSQL also wants an index for calling LOWER on # login_name, which we do with sql_istrcmp all over the place. diff --git a/Bugzilla/DB/Schema.pm b/Bugzilla/DB/Schema.pm index 23f6e0ca4..b8ac649b7 100644 --- a/Bugzilla/DB/Schema.pm +++ b/Bugzilla/DB/Schema.pm @@ -279,6 +279,29 @@ use constant ABSTRACT_SCHEMA => { ], }, + bugs_fulltext => { + FIELDS => [ + bug_id => {TYPE => 'INT3', NOTNULL => 1, PRIMARYKEY => 1, + REFERENCES => {TABLE => 'bugs', + COLUMN => 'bug_id', + DELETE => 'CASCADE'}}, + short_desc => {TYPE => 'varchar(255)', NOTNULL => 1}, + # Comments are stored all together in one column for searching. + # This allows us to examine all comments together when deciding + # the relevance of a bug in fulltext search. + comments => {TYPE => 'LONGTEXT'}, + comments_noprivate => {TYPE => 'LONGTEXT'}, + ], + INDEXES => [ + bugs_fulltext_short_desc_idx => {FIELDS => ['short_desc'], + TYPE => 'FULLTEXT'}, + bugs_fulltext_comments_idx => {FIELDS => ['comments'], + TYPE => 'FULLTEXT'}, + bugs_fulltext_comments_noprivate_idx => { + FIELDS => ['comments_noprivate'], TYPE => 'FULLTEXT'}, + ], + }, + bugs_activity => { FIELDS => [ bug_id => {TYPE => 'INT3', NOTNULL => 1}, @@ -336,8 +359,6 @@ use constant ABSTRACT_SCHEMA => { longdescs_bug_id_idx => ['bug_id'], longdescs_who_idx => [qw(who bug_id)], longdescs_bug_when_idx => ['bug_when'], - longdescs_thetext_idx => {FIELDS => ['thetext'], - TYPE => 'FULLTEXT'}, ], }, diff --git a/Bugzilla/DB/Schema/Mysql.pm b/Bugzilla/DB/Schema/Mysql.pm index 2f4bc42b2..627716970 100644 --- a/Bugzilla/DB/Schema/Mysql.pm +++ b/Bugzilla/DB/Schema/Mysql.pm @@ -86,7 +86,7 @@ use constant REVERSE_MAPPING => { # as in their db-specific version, so no reverse mapping is needed. }; -use constant MYISAM_TABLES => qw(longdescs); +use constant MYISAM_TABLES => qw(bugs_fulltext); #------------------------------------------------------------------------------ sub _initialize { diff --git a/Bugzilla/Install/DB.pm b/Bugzilla/Install/DB.pm index c6668aec2..ca011ce6c 100644 --- a/Bugzilla/Install/DB.pm +++ b/Bugzilla/Install/DB.pm @@ -315,12 +315,6 @@ sub update_table_definitions { $dbh->do('UPDATE quips SET userid = NULL WHERE userid = 0'); } - # Right now, we only create the "thetext" index on MySQL. - if ($dbh->isa('Bugzilla::DB::Mysql')) { - $dbh->bz_add_index('longdescs', 'longdescs_thetext_idx', - {TYPE => 'FULLTEXT', FIELDS => [qw(thetext)]}); - } - _convert_attachments_filename_from_mediumtext(); $dbh->bz_add_column('quips', 'approved', @@ -525,6 +519,9 @@ sub update_table_definitions { $dbh->bz_alter_column('namedqueries', 'query_type', {TYPE => 'BOOLEAN', NOTNULL => 1, DEFAULT => 0}); + $dbh->bz_drop_index('longdescs', 'longdescs_thetext_idx'); + _populate_bugs_fulltext(); + ################################################################ # New --TABLE-- changes should go *** A B O V E *** this point # ################################################################ @@ -2992,6 +2989,39 @@ sub _check_content_length { } } +sub _populate_bugs_fulltext { + my $dbh = Bugzilla->dbh; + my $fulltext = $dbh->selectrow_array('SELECT 1 FROM bugs_fulltext ' + . $dbh->sql_limit(1)); + # We only populate the table if it's empty... + if (!$fulltext) { + # ... and if there are bugs in the bugs table. + my $bug_ids = $dbh->selectcol_arrayref('SELECT bug_id FROM bugs'); + return if !@$bug_ids; + + print "Populating bugs_fulltext.short_desc...\n"; + $dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc) + SELECT bug_id, short_desc FROM bugs'); + print "Populating bugs_fulltext comments fields...\n"; + my $count = 1; + my $sth_all = $dbh->prepare('SELECT thetext FROM longdescs + WHERE bug_id = ?'); + my $sth_nopriv = $dbh->prepare('SELECT thetext FROM longdescs + WHERE bug_id = ? AND isprivate = 0'); + my $sth_update = $dbh->prepare( + 'UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ? + WHERE bug_id = ?'); + foreach my $id (@$bug_ids) { + my $all = $dbh->selectcol_arrayref($sth_all, undef, $id); + my $nopriv = $dbh->selectcol_arrayref($sth_nopriv, undef, $id); + $sth_update->execute(join("\n", @$all), join("\n", @$nopriv), $id); + indicate_progress({ total => scalar @$bug_ids, every => 100, + current => $count++ }); + } + print "\n"; + } +} + 1; __END__ diff --git a/Bugzilla/Search.pm b/Bugzilla/Search.pm index 252813a06..111875dac 100644 --- a/Bugzilla/Search.pm +++ b/Bugzilla/Search.pm @@ -47,11 +47,6 @@ use Bugzilla::Keyword; use Date::Format; use Date::Parse; -# How much we should add to the relevance, for each word that matches -# in bugs.short_desc, during fulltext search. This is high because -# we want summary matches to be *very* relevant, by default. -use constant SUMMARY_RELEVANCE_FACTOR => 7; - # Some fields are not sorted on themselves, but on other fields. # We need to have a list of these fields and what they map to. # Each field points to an array that contains the fields mapped @@ -1021,21 +1016,6 @@ sub BuildOrderBy { push(@$stringlist, trim($orderfield . ' ' . $orderdirection)); } -# This is a helper for fulltext search -sub _split_words_into_like { - my ($field, $text) = @_; - my $dbh = Bugzilla->dbh; - # This code is very similar to Bugzilla::DB::sql_fulltext_search, - # so you can look there if you'd like an explanation of what it's - # doing. - my $lower_text = lc($text); - my @words = split(/\s+/, $lower_text); - @words = map($dbh->quote("%$_%"), @words); - map(trick_taint($_), @words); - @words = map($dbh->sql_istrcmp($field, $_, 'LIKE'), @words); - return @words; -} - ##################################################################### # Search Functions ##################################################################### @@ -1252,44 +1232,24 @@ sub _content_matches { # accept the "matches" operator, which is specific to full-text # index searches. - # Add the longdescs table to the query so we can search comments. - my $table = "longdescs_$$chartid"; - my $extra = ""; - if (Bugzilla->params->{"insidergroup"} - && !Bugzilla->user->in_group(Bugzilla->params->{"insidergroup"})) - { - $extra = "AND $table.isprivate < 1"; - } - push(@$supptables, "LEFT JOIN longdescs AS $table " . - "ON bugs.bug_id = $table.bug_id $extra"); - + # Add the fulltext table to the query so we can search on it. + my $table = "bugs_fulltext_$$chartid"; + my $comments_col = "comments"; + $comments_col = "comments_noprivate" unless $self->{'user'}->is_insider; + push(@$supptables, "LEFT JOIN bugs_fulltext AS $table " . + "ON bugs.bug_id = $table.bug_id"); + # Create search terms to add to the SELECT and WHERE clauses. - # $term1 searches comments. - my $term1 = $dbh->sql_fulltext_search("${table}.thetext", $$v); - - # short_desc searching for the WHERE clause - my @words = _split_words_into_like('bugs.short_desc', $$v); - my $term2_where = join(' OR ', @words); - - # short_desc relevance - my $factor = SUMMARY_RELEVANCE_FACTOR; - my @s_words = map("CASE WHEN $_ THEN $factor ELSE 0 END", @words); - my $term2_select = join(' + ', @s_words); - + my $term1 = $dbh->sql_fulltext_search("$table.$comments_col", $$v); + my $term2 = $dbh->sql_fulltext_search("$table.short_desc", $$v); + # The term to use in the WHERE clause. - $$term = "$term1 > 0 OR ($term2_where)"; - + $$term = "$term1 > 0 OR $term2 > 0"; + # In order to sort by relevance (in case the user requests it), # we SELECT the relevance value and give it an alias so we can # add it to the SORT BY clause when we build it in buglist.cgi. - # - # Note: We should be calculating the relevance based on all - # comments for a bug, not just matching comments, but that's hard - # (see http://bugzilla.mozilla.org/show_bug.cgi?id=145588#c35). - my $select_term = "(SUM($term1) + $term2_select) AS relevance"; - - # add the column not used in aggregate function explicitly - push(@$groupby, 'bugs.short_desc'); + my $select_term = "($term1 + $term2) AS relevance"; # Users can specify to display the relevance field, in which case # it'll show up in the list of fields being selected, and we need diff --git a/importxml.pl b/importxml.pl index b7203a6e6..2644d3380 100755 --- a/importxml.pl +++ b/importxml.pl @@ -1253,6 +1253,7 @@ sub process_bug { VALUES (?,?,?,?,?,?)", undef, $id, $exporterid, $timestamp, $worktime, $private, $long_description ); + Bugzilla::Bug->new($id)->_sync_fulltext(); # Add this bug to each group of which its product is a member. my $sth_group = $dbh->prepare("INSERT INTO bug_group_map (bug_id, group_id) -- cgit v1.2.3-24-g4f1b