Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Aug 2010 21:31:08 +0000 (21:31 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Aug 2010 21:31:08 +0000 (21:31 +0000)
bright, but it beats assuming that a prefix match behaves identically to an
exact match, which is what the code was doing before :-(.  Noted while
experimenting with Artur Dobrowski's example.

src/backend/tsearch/ts_selfuncs.c

index 68d67c7a4e66cff45c4361ca67f751e4dce959e9..3948ef9367789ff403815153cce53892b06d1876 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
  *
  *  1 - select(oper) in NOT nodes
  *
- *  freq[val] in VAL nodes, if the value is in MCELEM
+ *  histogram-based estimation in prefix VAL nodes
+ *
+ *  freq[val] in exact VAL nodes, if the value is in MCELEM
  *  min(freq[MCELEM]) / 2 in VAL nodes, if it is not
  *
  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
  * binary search for determining freq[MCELEM].
  *
  * If we don't have stats for the tsvector, we still use this logic,
- * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
- * is signaled by lookup == NULL.
+ * except we use default estimates for VAL nodes.  This case is signaled
+ * by lookup == NULL.
  */
 static Selectivity
 tsquery_opr_selec(QueryItem *item, char *operand,
                  TextFreq *lookup, int length, float4 minfreq)
 {
-   LexemeKey   key;
-   TextFreq   *searchres;
-   Selectivity selec,
-               s1,
-               s2;
+   Selectivity selec;
 
    /* since this function recurses, it could be driven to stack overflow */
    check_stack_depth();
@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
    if (item->type == QI_VAL)
    {
        QueryOperand *oper = (QueryOperand *) item;
-
-       /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
-       if (lookup == NULL)
-           return (Selectivity) DEFAULT_TS_MATCH_SEL;
+       LexemeKey   key;
 
        /*
         * Prepare the key for bsearch().
@@ -294,56 +289,115 @@ tsquery_opr_selec(QueryItem *item, char *operand,
        key.lexeme = operand + oper->distance;
        key.length = oper->length;
 
-       searchres = (TextFreq *) bsearch(&key, lookup, length,
-                                        sizeof(TextFreq),
-                                        compare_lexeme_textfreq);
-
-       if (searchres)
+       if (oper->prefix)
        {
+           /* Prefix match, ie the query item is lexeme:* */
+           Selectivity matched,
+                       allmcvs;
+           int         i;
+
+           /*
+            * Our strategy is to scan through the MCV list and add up the
+            * frequencies of the ones that match the prefix, thereby
+            * assuming that the MCVs are representative of the whole lexeme
+            * population in this respect.  Compare histogram_selectivity().
+            *
+            * This is only a good plan if we have a pretty fair number of
+            * MCVs available; we set the threshold at 100.  If no stats or
+            * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
+            */
+           if (lookup == NULL || length < 100)
+               return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+           matched = allmcvs = 0;
+           for (i = 0; i < length; i++)
+           {
+               TextFreq   *t = lookup + i;
+               int         tlen = VARSIZE_ANY_EXHDR(t->element);
+
+               if (tlen >= key.length &&
+                   strncmp(key.lexeme, VARDATA_ANY(t->element),
+                           key.length) == 0)
+                   matched += t->frequency;
+               allmcvs += t->frequency;
+           }
+
+           if (allmcvs > 0)    /* paranoia about zero divide */
+               selec = matched / allmcvs;
+           else
+               selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
            /*
-            * The element is in MCELEM.  Return precise selectivity (or at
-            * least as precise as ANALYZE could find out).
+            * In any case, never believe that a prefix match has selectivity
+            * less than DEFAULT_TS_MATCH_SEL.
             */
-           return (Selectivity) searchres->frequency;
+           selec = Max(DEFAULT_TS_MATCH_SEL, selec);
        }
        else
        {
-           /*
-            * The element is not in MCELEM.  Punt, but assume that the
-            * selectivity cannot be more than minfreq / 2.
-            */
-           return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+           /* Regular exact lexeme match */
+           TextFreq   *searchres;
+
+           /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+           if (lookup == NULL)
+               return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
+           searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                            sizeof(TextFreq),
+                                            compare_lexeme_textfreq);
+
+           if (searchres)
+           {
+               /*
+                * The element is in MCELEM.  Return precise selectivity (or
+                * at least as precise as ANALYZE could find out).
+                */
+               selec = searchres->frequency;
+           }
+           else
+           {
+               /*
+                * The element is not in MCELEM.  Punt, but assume that the
+                * selectivity cannot be more than minfreq / 2.
+                */
+               selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+           }
        }
    }
-
-   /* Current TSQuery node is an operator */
-   switch (item->qoperator.oper)
+   else
    {
-       case OP_NOT:
-           selec = 1.0 - tsquery_opr_selec(item + 1, operand,
-                                           lookup, length, minfreq);
-           break;
-
-       case OP_AND:
-           s1 = tsquery_opr_selec(item + 1, operand,
-                                  lookup, length, minfreq);
-           s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-                                  lookup, length, minfreq);
-           selec = s1 * s2;
-           break;
-
-       case OP_OR:
-           s1 = tsquery_opr_selec(item + 1, operand,
-                                  lookup, length, minfreq);
-           s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-                                  lookup, length, minfreq);
-           selec = s1 + s2 - s1 * s2;
-           break;
-
-       default:
-           elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
-           selec = 0;          /* keep compiler quiet */
-           break;
+       /* Current TSQuery node is an operator */
+       Selectivity s1,
+                   s2;
+
+       switch (item->qoperator.oper)
+       {
+           case OP_NOT:
+               selec = 1.0 - tsquery_opr_selec(item + 1, operand,
+                                               lookup, length, minfreq);
+               break;
+
+           case OP_AND:
+               s1 = tsquery_opr_selec(item + 1, operand,
+                                      lookup, length, minfreq);
+               s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+                                      lookup, length, minfreq);
+               selec = s1 * s2;
+               break;
+
+           case OP_OR:
+               s1 = tsquery_opr_selec(item + 1, operand,
+                                      lookup, length, minfreq);
+               s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+                                      lookup, length, minfreq);
+               selec = s1 + s2 - s1 * s2;
+               break;
+
+           default:
+               elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
+               selec = 0;          /* keep compiler quiet */
+               break;
+       }
    }
 
    /* Clamp intermediate results to stay sane despite roundoff error */