Bug 554819: Quicksearch should be using Text::ParseWords instead of custom code in splitString

Also fixes QS with accented characters (bug 730207) r=dkl a=LpSolit
author: Frédéric Buclin <LpSolit@gmail.com> 2012-03-29 19:56:41 +0200
committer: Frédéric Buclin <LpSolit@gmail.com> 2012-03-29 19:56:41 +0200
commit: 8f4fd271de4edbf1a206dfa0a98f8276f6189709 (patch)
tree: c7f97377b18dd5b1845c80fffe79c0b413b23863 /Bugzilla
parent: c89d334af4b6d288f6294cfe8acfa939744fcec6 (diff)
download: bugzilla-8f4fd271de4edbf1a206dfa0a98f8276f6189709.tar.gz
bugzilla-8f4fd271de4edbf1a206dfa0a98f8276f6189709.tar.xz
2 files changed, 95 insertions, 95 deletions
diff --git a/Bugzilla/Search/Quicksearch.pm b/Bugzilla/Search/Quicksearch.pm
index 8425a2be2..7b951d5ae 100644
--- a/Bugzilla/Search/Quicksearch.pm
+++ b/Bugzilla/Search/Quicksearch.pm
@@ -32,6 +32,7 @@ use Bugzilla::Util;
 
 use List::Util qw(min max);
 use List::MoreUtils qw(firstidx);
+use Text::ParseWords qw(parse_line);
 
 use base qw(Exporter);
 @Bugzilla::Search::Quicksearch::EXPORT = qw(quicksearch);
@@ -142,54 +143,95 @@ sub quicksearch {
     $searchstring =~ s/(^[\s,]+|[\s,]+$)//g;
     ThrowUserError('buglist_parameters_required') unless ($searchstring);
 
-    $fulltext = Bugzilla->user->setting('quicksearch_fulltext') eq 'on' ? 1 : 0;
-
     if ($searchstring =~ m/^[0-9,\s]*$/) {
         _bug_numbers_only($searchstring);
     }
     else {
         _handle_alias($searchstring);
 
-        # Globally translate " AND ", " OR ", " NOT " to space, pipe, dash.
-        $searchstring =~ s/\s+AND\s+/ /g;
-        $searchstring =~ s/\s+OR\s+/|/g;
-        $searchstring =~ s/\s+NOT\s+/ -/g;
+        # Retain backslashes and quotes, to know which strings are quoted,
+        # and which ones are not.
+        my @words = parse_line('\s+', 1, $searchstring);
+        # If parse_line() returns no data, this means strings are badly quoted.
+        # Rather than trying to guess what the user wanted to do, we throw an error.
+        scalar(@words)
+          || ThrowUserError('quicksearch_unbalanced_quotes', {string => $searchstring});
+
+        # A query cannot start with AND or OR, nor can it end with AND, OR or NOT.
+        ThrowUserError('quicksearch_invalid_query')
+          if ($words[0] =~ /^(?:AND|OR)$/ || $words[$#words] =~ /^(?:AND|OR|NOT)$/);
+
+        my (@qswords, @or_group);
+        while (scalar @words) {
+            my $word = shift @words;
+            # AND is the default word separator, similar to a whitespace,
+            # but |a AND OR b| is not a valid combination.
+            if ($word eq 'AND') {
+                ThrowUserError('quicksearch_invalid_query', {operators => ['AND', 'OR']})
+                  if $words[0] eq 'OR';
+            }
+            # |a OR AND b| is not a valid combination.
+            # |a OR OR b| is equivalent to |a OR b| and so is harmless.
+            elsif ($word eq 'OR') {
+                ThrowUserError('quicksearch_invalid_query', {operators => ['OR', 'AND']})
+                  if $words[0] eq 'AND';
+            }
+            # NOT negates the following word.
+            # |NOT AND| and |NOT OR| are not valid combinations.
+            # |NOT NOT| is fine but has no effect as they cancel themselves.
+            elsif ($word eq 'NOT') {
+                $word = shift @words;
+                next if $word eq 'NOT';
+                if ($word eq 'AND' || $word eq 'OR') {
+                    ThrowUserError('quicksearch_invalid_query', {operators => ['NOT', $word]});
+                }
+                unshift(@words, "-$word");
+            }
+            else {
+                # OR groups words together, as OR has higher precedence than AND.
+                push(@or_group, $word);
+                # If the next word is not OR, then we are not in a OR group,
+                # or we are leaving it.
+                if (!defined $words[0] || $words[0] ne 'OR') {
+                    push(@qswords, join('|', @or_group));
+                    @or_group = ();
+                }
+            }
+        }
 
-        my @words = splitString($searchstring);
-        _handle_status_and_resolution(\@words);
+        _handle_status_and_resolution(\@qswords);
 
         my (@unknownFields, %ambiguous_fields);
+        $fulltext = Bugzilla->user->setting('quicksearch_fulltext') eq 'on' ? 1 : 0;
 
         # Loop over all main-level QuickSearch words.
-        foreach my $qsword (@words) {
-            my $negate = substr($qsword, 0, 1) eq '-';
-            if ($negate) {
-                $qsword = substr($qsword, 1);
-            }
+        foreach my $qsword (@qswords) {
+            my @or_operand = parse_line('\|', 1, $qsword);
+            foreach my $term (@or_operand) {
+                my $negate = substr($term, 0, 1) eq '-';
+                if ($negate) {
+                    $term = substr($term, 1);
+                }
 
-            # No special first char
-            if (!_handle_special_first_chars($qsword, $negate)) {
-                # Split by '|' to get all operands for a boolean OR.
-                foreach my $or_operand (split(/\|/, $qsword)) {
-                    if (!_handle_field_names($or_operand, $negate,
-                                             \@unknownFields, 
-                                             \%ambiguous_fields))
-                    {
-                        # Having ruled out the special cases, we may now split
-                        # by comma, which is another legal boolean OR indicator.
-                        foreach my $word (split(/,/, $or_operand)) {
-                            if (!_special_field_syntax($word, $negate)) {
-                                _default_quicksearch_word($word, $negate);
-                            }
-                            _handle_urls($word, $negate);
-                        }
+                next if _handle_special_first_chars($term, $negate);
+                next if _handle_field_names($term, $negate, \@unknownFields,
+                                            \%ambiguous_fields);
+
+                # Having ruled out the special cases, we may now split
+                # by comma, which is another legal boolean OR indicator.
+                # Remove quotes from quoted words, if any.
+                @words = parse_line(',', 0, $term);
+                foreach my $word (@words) {
+                    if (!_special_field_syntax($word, $negate)) {
+                        _default_quicksearch_word($word, $negate);
                     }
+                    _handle_urls($word, $negate);
                 }
             }
             $chart++;
             $and = 0;
             $or = 0;
-        } # foreach (@words)
+        }
 
         # Inform user about any unknown fields
         if (scalar(@unknownFields) || scalar(keys %ambiguous_fields)) {
@@ -315,7 +357,7 @@ sub _handle_special_first_chars {
 
     my $firstChar = substr($qsword, 0, 1);
     my $baseWord = substr($qsword, 1);
-    my @subWords = split(/[\|,]/, $baseWord);
+    my @subWords = split(/,/, $baseWord);
 
     if ($firstChar eq '#') {
         addChart('short_desc', 'substring', $baseWord, $negate);
@@ -347,7 +389,7 @@ sub _handle_special_first_chars {
 
 sub _handle_field_names {
     my ($or_operand, $negate, $unknownFields, $ambiguous_fields) = @_;
-    
+
     # Flag and requestee shortcut
     if ($or_operand =~ /^(?:flag:)?([^\?]+\?)([^\?]*)$/) {
         addChart('flagtypes.name', 'substring', $1, $negate);
@@ -355,32 +397,40 @@ sub _handle_field_names {
         addChart('requestees.login_name', 'substring', $2, $negate);
         return 1;
     }
-    
-    # generic field1,field2,field3:value1,value2 notation
-    if ($or_operand =~ /^([^:]+):([^:]+)$/) {
-        my @fields = split(/,/, $1);
-        my @values = split(/,/, $2);
+
+    # Generic field1,field2,field3:value1,value2 notation.
+    # We have to correctly ignore commas and colons in quotes.
+    my @field_values = parse_line(':', 1, $or_operand);
+    if (scalar @field_values == 2) {
+        my @fields = parse_line(',', 1, $field_values[0]);
+        my @values = parse_line(',', 1, $field_values[1]);
         foreach my $field (@fields) {
             my $translated = _translate_field_name($field);
             # Skip and record any unknown fields
             if (!defined $translated) {
                 push(@$unknownFields, $field);
-                next;
             }
             # If we got back an array, that means the substring is
             # ambiguous and could match more than field name
             elsif (ref $translated) {
                 $ambiguous_fields->{$field} = $translated;
-                next;
             }
-            foreach my $value (@values) {
-                my $operator = FIELD_OPERATOR->{$translated} || 'substring';
-                addChart($translated, $operator, $value, $negate);
+            else {
+                foreach my $value (@values) {
+                    my $operator = FIELD_OPERATOR->{$translated} || 'substring';
+                    # If the string was quoted to protect some special
+                    # characters such as commas and colons, we need
+                    # to remove quotes.
+                    if ($value =~ /^(["'])(.+)\g1$/) {
+                        $value = $2;
+                        $value =~ s/\\(["'])/$1/g;
+                    }
+                    addChart($translated, $operator, $value, $negate);
+                }
             }
         }
         return 1;
     }
-    
     return 0;
 }
 
@@ -513,41 +563,6 @@ sub _handle_urls {
 # Helpers
 ###########################################################################
 
-# Split string on whitespace, retaining quoted strings as one
-sub splitString {
-    my $string = shift;
-    my @quoteparts;
-    my @parts;
-    my $i = 0;
-
-    # Now split on quote sign; be tolerant about unclosed quotes
-    @quoteparts = split(/"/, $string);
-    foreach my $part (@quoteparts) {
-        # After every odd quote, quote special chars
-        if ($i++ %2) {
-            $part = url_quote($part);
-            # Protect the minus sign from being considered
-            # as negation, in quotes.
-            $part =~ s/(?<=^)\-/%2D/;
-        }
-    }
-    # Join again
-    $string = join('"', @quoteparts);
-
-    # Now split on unescaped whitespace
-    @parts = split(/\s+/, $string);
-    foreach (@parts) {
-        # Protect plus signs from becoming a blank.
-        # If "+" appears as the first character, leave it alone
-        # as it has a special meaning. Strings which start with
-        # "+" must be quoted.
-        s/(?<!^)\+/%2B/g;
-        # Remove quotes
-        s/"//g;
-    }
-    return @parts;
-}
-
 # Quote and escape a phrase appropriately for a "content matches" search.
 sub _matches_phrase {
     my ($phrase) = @_;
@@ -613,7 +628,7 @@ sub makeChart {
     my $cgi = Bugzilla->cgi;
     $cgi->param("field$expr", $field);
     $cgi->param("type$expr",  $type);
-    $cgi->param("value$expr", url_decode($value));
+    $cgi->param("value$expr", $value);
 }
 
 1;
diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm
index 6d8622e04..7ecaddc88 100644
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -34,7 +34,7 @@ use base qw(Exporter);
 @Bugzilla::Util::EXPORT = qw(trick_taint detaint_natural
                              detaint_signed
                              html_quote url_quote xml_quote
-                             css_class_quote html_light_quote url_decode
+                             css_class_quote html_light_quote
                              i_am_cgi correct_urlbase remote_ip
                              do_ssl_redirect_if_required use_attachbase
                              diff_arrays on_main_db
@@ -243,14 +243,6 @@ sub xml_quote {
     return $var;
 }
 
-sub url_decode {
-    my ($todecode) = (@_);
-    $todecode =~ tr/+/ /;       # pluses become spaces
-    $todecode =~ s/%([0-9a-fA-F]{2})/pack("c",hex($1))/ge;
-    utf8::decode($todecode) if Bugzilla->params->{'utf8'};
-    return $todecode;
-}
-
 sub i_am_cgi {
     # I use SERVER_SOFTWARE because it's required to be
     # defined for all requests in the CGI spec.
@@ -756,9 +748,6 @@ Bugzilla::Util - Generic utility functions for bugzilla
   xml_quote($var);
   email_filter($var);
 
-  # Functions for decoding
-  $rv = url_decode($var);
-
   # Functions that tell you about your environment
   my $is_cgi   = i_am_cgi();
   my $urlbase  = correct_urlbase();
@@ -870,10 +859,6 @@ This is similar to C<html_quote>, except that ' is escaped to &apos;. This
 is kept separate from html_quote partly for compatibility with previous code
 (for &apos;) and partly for future handling of non-ASCII characters.
 
-=item C<url_decode($val)>
-
-Converts the %xx encoding from the given URL back to its original form.
-
 =item C<email_filter>
 
 Removes the hostname from email addresses in the string, if the user
author	Frédéric Buclin <LpSolit@gmail.com>	2012-03-29 19:56:41 +0200
committer	Frédéric Buclin <LpSolit@gmail.com>	2012-03-29 19:56:41 +0200
commit	8f4fd271de4edbf1a206dfa0a98f8276f6189709 (patch)
tree	c7f97377b18dd5b1845c80fffe79c0b413b23863 /Bugzilla
parent	c89d334af4b6d288f6294cfe8acfa939744fcec6 (diff)
download	bugzilla-8f4fd271de4edbf1a206dfa0a98f8276f6189709.tar.gz bugzilla-8f4fd271de4edbf1a206dfa0a98f8276f6189709.tar.xz