Fix up core tsquery GIN support for new extractQuery API.
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 9 Jan 2011 19:34:50 +0000 (14:34 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 9 Jan 2011 19:34:50 +0000 (14:34 -0500)
No need for the empty-prefix-match kluge to force a full scan anymore.

src/backend/utils/adt/tsginidx.c
src/backend/utils/adt/tsvector_op.c
src/include/tsearch/ts_utils.h

index 9e7ca66132b3897761882944e16094a2a7c430bc..56cd9b70d28b715268046f10216edbdcf05ecba5 100644 (file)
@@ -13,6 +13,7 @@
  */
 #include "postgres.h"
 
+#include "access/gin.h"
 #include "access/skey.h"
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
@@ -26,8 +27,7 @@ gin_cmp_tslexeme(PG_FUNCTION_ARGS)
        text       *b = PG_GETARG_TEXT_PP(1);
        int                     cmp;
 
-       cmp = tsCompareString(
-                                                 VARDATA_ANY(a), VARSIZE_ANY_EXHDR(a),
+       cmp = tsCompareString(VARDATA_ANY(a), VARSIZE_ANY_EXHDR(a),
                                                  VARDATA_ANY(b), VARSIZE_ANY_EXHDR(b),
                                                  false);
 
@@ -48,8 +48,7 @@ gin_cmp_prefix(PG_FUNCTION_ARGS)
 #endif
        int                     cmp;
 
-       cmp = tsCompareString(
-                                                 VARDATA_ANY(a), VARSIZE_ANY_EXHDR(a),
+       cmp = tsCompareString(VARDATA_ANY(a), VARSIZE_ANY_EXHDR(a),
                                                  VARDATA_ANY(b), VARSIZE_ANY_EXHDR(b),
                                                  true);
 
@@ -96,71 +95,72 @@ gin_extract_tsquery(PG_FUNCTION_ARGS)
 {
        TSQuery         query = PG_GETARG_TSQUERY(0);
        int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
-
        /* StrategyNumber strategy = PG_GETARG_UINT16(2); */
        bool      **ptr_partialmatch = (bool **) PG_GETARG_POINTER(3);
        Pointer   **extra_data = (Pointer **) PG_GETARG_POINTER(4);
+       /* bool   **nullFlags = (bool **) PG_GETARG_POINTER(5); */
+       int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
        Datum      *entries = NULL;
-       bool       *partialmatch;
 
        *nentries = 0;
 
        if (query->size > 0)
        {
+               QueryItem  *item = GETQUERY(query);
                int4            i,
-                                       j = 0,
-                                       len;
-               QueryItem  *item;
-               bool            use_fullscan = false;
+                                       j;
+               bool       *partialmatch;
                int                *map_item_operand;
 
-               item = clean_NOT(GETQUERY(query), &len);
-               if (!item)
-               {
-                       use_fullscan = true;
-                       *nentries = 1;
-               }
-
-               item = GETQUERY(query);
+               /*
+                * If the query doesn't have any required positive matches (for
+                * instance, it's something like '! foo'), we have to do a full
+                * index scan.
+                */
+               if (tsquery_requires_match(item))
+                       *searchMode = GIN_SEARCH_MODE_DEFAULT;
+               else
+                       *searchMode = GIN_SEARCH_MODE_ALL;
 
+               /* count number of VAL items */
+               j = 0;
                for (i = 0; i < query->size; i++)
+               {
                        if (item[i].type == QI_VAL)
-                               (*nentries)++;
+                               j++;
+               }
+               *nentries = j;
 
-               entries = (Datum *) palloc(sizeof(Datum) * (*nentries));
-               partialmatch = *ptr_partialmatch = (bool *) palloc(sizeof(bool) * (*nentries));
+               entries = (Datum *) palloc(sizeof(Datum) * j);
+               partialmatch = *ptr_partialmatch = (bool *) palloc(sizeof(bool) * j);
 
                /*
                 * Make map to convert item's number to corresponding operand's (the
                 * same, entry's) number. Entry's number is used in check array in
                 * consistent method. We use the same map for each entry.
                 */
-               *extra_data = (Pointer *) palloc0(sizeof(Pointer) * (*nentries));
-               map_item_operand = palloc0(sizeof(int) * (query->size + 1));
+               *extra_data = (Pointer *) palloc(sizeof(Pointer) * j);
+               map_item_operand = (int *) palloc0(sizeof(int) * query->size);
 
+               /* Now rescan the VAL items and fill in the arrays */
+               j = 0;
                for (i = 0; i < query->size; i++)
+               {
                        if (item[i].type == QI_VAL)
                        {
-                               text       *txt;
                                QueryOperand *val = &item[i].qoperand;
+                               text       *txt;
 
                                txt = cstring_to_text_with_len(GETOPERAND(query) + val->distance,
                                                                                           val->length);
+                               entries[j] = PointerGetDatum(txt);
+                               partialmatch[j] = val->prefix;
                                (*extra_data)[j] = (Pointer) map_item_operand;
                                map_item_operand[i] = j;
-                               partialmatch[j] = val->prefix;
-                               entries[j++] = PointerGetDatum(txt);
+                               j++;
                        }
-
-               if (use_fullscan)
-               {
-                       (*extra_data)[j] = (Pointer) map_item_operand;
-                       map_item_operand[i] = j;
-                       entries[j++] = PointerGetDatum(cstring_to_text_with_len("", 0));
                }
        }
-       else
-               *nentries = -1;                 /* nothing can be found */
 
        PG_FREE_IF_COPY(query, 0);
 
@@ -222,12 +222,10 @@ gin_tsquery_consistent(PG_FUNCTION_ARGS)
                gcv.map_item_operand = (int *) (extra_data[0]);
                gcv.need_recheck = recheck;
 
-               res = TS_execute(
-                                                GETQUERY(query),
+               res = TS_execute(GETQUERY(query),
                                                 &gcv,
                                                 true,
-                                                checkcondition_gin
-                       );
+                                                checkcondition_gin);
        }
 
        PG_RETURN_BOOL(res);
index 38c1401398ce39c16fa13651a38b4c35459e13ff..b7a822d3544aa93dd24318381622d6130273d9e5 100644 (file)
@@ -525,7 +525,8 @@ tsvector_concat(PG_FUNCTION_ARGS)
 
 /*
  * Compare two strings by tsvector rules.
- * if isPrefix = true then it returns not-zero value if b has prefix a
+ *
+ * if prefix = true then it returns zero value iff b has prefix a
  */
 int4
 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
@@ -535,8 +536,7 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
        if (lena == 0)
        {
                if (prefix)
-                       cmp = 0;                        /* emtry string is equal to any if a prefix
-                                                                * match */
+                       cmp = 0;                        /* empty string is prefix of anything */
                else
                        cmp = (lenb > 0) ? -1 : 0;
        }
@@ -551,14 +551,9 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
                if (prefix)
                {
                        if (cmp == 0 && lena > lenb)
-                       {
-                               /*
-                                * b argument is not beginning with argument a
-                                */
-                               cmp = 1;
-                       }
+                               cmp = 1;                /* a is longer, so not a prefix of b */
                }
-               else if ((cmp == 0) && (lena != lenb))
+               else if (cmp == 0 && lena != lenb)
                {
                        cmp = (lena < lenb) ? -1 : 1;
                }
@@ -650,13 +645,13 @@ checkcondition_str(void *checkval, QueryOperand *val)
 }
 
 /*
- * check for boolean condition.
+ * Evaluate tsquery boolean expression.
  *
- * if calcnot is false, NOT expressions are always evaluated to be true. This is used in ranking.
+ * chkcond is a callback function used to evaluate each VAL node in the query.
  * checkval can be used to pass information to the callback. TS_execute doesn't
  * do anything with it.
- * chkcond is a callback function used to evaluate each VAL node in the query.
- *
+ * if calcnot is false, NOT expressions are always evaluated to be true. This
+ * is used in ranking.
  */
 bool
 TS_execute(QueryItem *curitem, void *checkval, bool calcnot,
@@ -675,6 +670,7 @@ TS_execute(QueryItem *curitem, void *checkval, bool calcnot,
                                return !TS_execute(curitem + 1, checkval, calcnot, chkcond);
                        else
                                return true;
+
                case OP_AND:
                        if (TS_execute(curitem + curitem->qoperator.left, checkval, calcnot, chkcond))
                                return TS_execute(curitem + 1, checkval, calcnot, chkcond);
@@ -695,6 +691,55 @@ TS_execute(QueryItem *curitem, void *checkval, bool calcnot,
        return false;
 }
 
+/*
+ * Detect whether a tsquery boolean expression requires any positive matches
+ * to values shown in the tsquery.  Returns true only when a match is required.
+ *
+ * This is needed to know whether a GIN index search requires full index scan.
+ * For example, 'x & !y' requires a match of x, so it's sufficient to scan
+ * entries for x; but 'x | !y' could match rows containing neither x nor y.
+ */
+bool
+tsquery_requires_match(QueryItem *curitem)
+{
+       /* this function recurses, so guard against being driven to stack overflow */
+       check_stack_depth();
+
+       if (curitem->type == QI_VAL)
+               return true;            /* a VAL node is itself a required match */
+
+       switch (curitem->qoperator.oper)
+       {
+               case OP_NOT:
+                       /*
+                        * Assume there are no required matches underneath a NOT.  For
+                        * some cases with nested NOTs, we could prove there's a required
+                        * match, but it seems unlikely to be worth the trouble.
+                        */
+                       return false;
+
+               case OP_AND:
+                       /* An AND requires a match if either of its sides does */
+                       if (tsquery_requires_match(curitem + curitem->qoperator.left))
+                               return true;
+                       else
+                               return tsquery_requires_match(curitem + 1);
+
+               case OP_OR:
+                       /* An OR requires a match only if both of its sides do */
+                       if (tsquery_requires_match(curitem + curitem->qoperator.left))
+                               return tsquery_requires_match(curitem + 1);
+                       else
+                               return false;
+
+               default:
+                       elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+       }
+
+       /* not reachable, but keep compiler quiet */
+       return false;
+}
+
 /*
  * boolean operations
  */
index 62890aabb70193446c4bb61426819ea9cd84bcdf..1bd40344881a958fac324e9c51aac83d6c52faf0 100644 (file)
@@ -104,9 +104,9 @@ extern text *generateHeadline(HeadlineParsedText *prs);
 /*
  * Common check function for tsvector @@ tsquery
  */
-
 extern bool TS_execute(QueryItem *curitem, void *checkval, bool calcnot,
                   bool (*chkcond) (void *checkval, QueryOperand *val));
+extern bool tsquery_requires_match(QueryItem *curitem);
 
 /*
  * to_ts* - text transformation to tsvector, tsquery