Make websearch_to_tsquery() parse text in quotes as a single token
author Alexander Korotkov <akorotkov@postgresql.org>
Mon, 3 May 2021 00:58:03 +0000 (03:58 +0300)
committer Alexander Korotkov <akorotkov@postgresql.org>
Mon, 3 May 2021 01:18:19 +0000 (04:18 +0300)
websearch_to_tsquery() splits text in quotes into tokens and connects them with
the phrase operator on its own.  However, that leads to surprising results when
a token contains no words.

For instance, websearch_to_tsquery('"aaa: bbb"') is 'aaa <2> bbb', because
it is equivalent to to_tsquery(E'aaa <-> \':\' <-> bbb').  But
websearch_to_tsquery('"aaa: bbb"') has to be 'aaa <-> bbb' in order to match
to_tsvector('aaa: bbb').

Since 0c4f355c6a, we connect lexemes of complex tokens with phrase operators
anyway.  Thus, let's just make websearch_to_tsquery() parse text in quotes as
a single token.  Therefore, websearch_to_tsquery() should process the quoted
text in the same way phraseto_tsquery() does.  This solution is exactly what we
need and also simplifies the code.

This commit is an incompatible change, so we don't backpatch it.

Reported-by: Valentin Gatien-Baron
Discussion: https://postgr.es/m/CA%2B0DEqiZs7gdOd4ikmg%3D0UWG%2BSwWOLxPsk_JW-sx9WNOyrb0KQ%40mail.gmail.com
Author: Alexander Korotkov
Reviewed-by: Tom Lane, Zhihong Yu
src/backend/utils/adt/tsquery.c
src/test/regress/expected/tsearch.out
src/test/regress/sql/tsearch.sql

index fe4470174f5d10558b7730f0a78e07c76b87ae88..b2ca0d2f8a248daf0a79233e0e0c0450dbd2bd99 100644 (file)
@@ -77,7 +77,6 @@ struct TSQueryParserStateData
    char       *buf;            /* current scan point */
    int         count;          /* nesting count, incremented by (,
                                 * decremented by ) */
-   bool        in_quotes;      /* phrase in quotes "" */
    ts_parserstate state;
 
    /* polish (prefix) notation in list, filled in by push* functions */
@@ -235,9 +234,6 @@ parse_or_operator(TSQueryParserState pstate)
 {
    char       *ptr = pstate->buf;
 
-   if (pstate->in_quotes)
-       return false;
-
    /* it should begin with "OR" literal */
    if (pg_strncasecmp(ptr, "or", 2) != 0)
        return false;
@@ -398,38 +394,29 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                    state->buf++;
                    state->state = WAITOPERAND;
 
-                   if (state->in_quotes)
-                       continue;
-
                    *operator = OP_NOT;
                    return PT_OPR;
                }
                else if (t_iseq(state->buf, '"'))
                {
+                   /* Everything in quotes is processed as a single token */
+
+                   /* skip opening quote */
                    state->buf++;
+                   *strval = state->buf;
 
-                   if (!state->in_quotes)
-                   {
-                       state->state = WAITOPERAND;
+                   /* iterate to the closing quote or end of the string */
+                   while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
+                       state->buf++;
+                   *lenval = state->buf - *strval;
 
-                       if (strchr(state->buf, '"'))
-                       {
-                           /* quoted text should be ordered <-> */
-                           state->in_quotes = true;
-                           return PT_OPEN;
-                       }
+                   /* skip closing quote if not end of the string */
+                   if (*state->buf != '\0')
+                       state->buf++;
 
-                       /* web search tolerates missing quotes */
-                       continue;
-                   }
-                   else
-                   {
-                       /* we have to provide an operand */
-                       state->in_quotes = false;
-                       state->state = WAITOPERATOR;
-                       pushStop(state);
-                       return PT_CLOSE;
-                   }
+                   state->state = WAITOPERATOR;
+                   state->count++;
+                   return PT_VAL;
                }
                else if (ISOPERATOR(state->buf))
                {
@@ -467,24 +454,13 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
            case WAITOPERATOR:
                if (t_iseq(state->buf, '"'))
                {
-                   if (!state->in_quotes)
-                   {
-                       /*
-                        * put implicit AND after an operand and handle this
-                        * quote in WAITOPERAND
-                        */
-                       state->state = WAITOPERAND;
-                       *operator = OP_AND;
-                       return PT_OPR;
-                   }
-                   else
-                   {
-                       state->buf++;
-
-                       /* just close quotes */
-                       state->in_quotes = false;
-                       return PT_CLOSE;
-                   }
+                   /*
+                    * put implicit AND after an operand and handle this quote
+                    * in WAITOPERAND
+                    */
+                   state->state = WAITOPERAND;
+                   *operator = OP_AND;
+                   return PT_OPR;
                }
                else if (parse_or_operator(state))
                {
@@ -498,18 +474,8 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                }
                else if (!t_isspace(state->buf))
                {
-                   if (state->in_quotes)
-                   {
-                       /* put implicit <-> after an operand */
-                       *operator = OP_PHRASE;
-                       *weight = 1;
-                   }
-                   else
-                   {
-                       /* put implicit AND after an operand */
-                       *operator = OP_AND;
-                   }
-
+                   /* put implicit AND after an operand */
+                   *operator = OP_AND;
                    state->state = WAITOPERAND;
                    return PT_OPR;
                }
@@ -846,7 +812,6 @@ parse_tsquery(char *buf,
    state.buffer = buf;
    state.buf = buf;
    state.count = 0;
-   state.in_quotes = false;
    state.state = WAITFIRSTOPERAND;
    state.polstr = NIL;
 
index 4ae62320c9ffebe17ce20441d7281d6cfb321ce1..45b92a6338836c7f88b2ae948c0d4f036a2f3abb 100644 (file)
@@ -2678,9 +2678,9 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
 
 -- test quotes
 select websearch_to_tsquery('english', '"pg_class pg');
-  websearch_to_tsquery   
--------------------------
- 'pg' <-> 'class' & 'pg'
+   websearch_to_tsquery    
+---------------------------
+ 'pg' <-> 'class' <-> 'pg'
 (1 row)
 
 select websearch_to_tsquery('english', 'pg_class pg"');
@@ -2695,6 +2695,12 @@ select websearch_to_tsquery('english', '"pg_class pg"');
  'pg' <-> 'class' <-> 'pg'
 (1 row)
 
+select websearch_to_tsquery('english', '"pg_class : pg"');
+   websearch_to_tsquery    
+---------------------------
+ 'pg' <-> 'class' <-> 'pg'
+(1 row)
+
 select websearch_to_tsquery('english', 'abc "pg_class pg"');
        websearch_to_tsquery        
 -----------------------------------
@@ -2708,15 +2714,15 @@ select websearch_to_tsquery('english', '"pg_class pg" def');
 (1 row)
 
 select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
-                  websearch_to_tsquery                  
---------------------------------------------------------
- 'abc' & 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' & 'def'
+                websearch_to_tsquery                
+----------------------------------------------------
+ 'abc' & 'pg' <-> 'pg' <-> 'class' <-> 'pg' & 'def'
 (1 row)
 
 select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
-          websearch_to_tsquery          
-----------------------------------------
- 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg'
+        websearch_to_tsquery        
+------------------------------------
+ 'pg' <-> 'pg' <-> 'class' <-> 'pg'
 (1 row)
 
 select websearch_to_tsquery('english', '""pg pg_class pg""');
index b02ed73f6a8c2d22358f26b6c7d719906aa6ec2d..d929210998ae5ec6e37b9b82e45a85440f843451 100644 (file)
@@ -759,6 +759,7 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
 select websearch_to_tsquery('english', '"pg_class pg');
 select websearch_to_tsquery('english', 'pg_class pg"');
 select websearch_to_tsquery('english', '"pg_class pg"');
+select websearch_to_tsquery('english', '"pg_class : pg"');
 select websearch_to_tsquery('english', 'abc "pg_class pg"');
 select websearch_to_tsquery('english', '"pg_class pg" def');
 select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');