Cosmetic improvements for default text search parser's ts_headline code.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c

index 898466fcefd64cae2845884c9a8126b4c6f38336..9a351ac9da4cce53c09a4cde32871c48aecbec06 100644 (file)
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1915,6 +1915,12 @@ prsd_end(PG_FUNCTION_ARGS)
         PG_RETURN_VOID();
  }
  
+
+/*
+ * ts_headline support begins here
+ */
+
+/* token type classification macros */
  #define LEAVETOKEN(x)  ( (x)==SPACE )
  #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
  #define ENDPUNCTOKEN(x) ( (x)==SPACE )
@@ -1926,23 +1932,54 @@ prsd_end(PG_FUNCTION_ARGS)
  #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
  #define NOENDTOKEN(x)  ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
  
+/*
+ * Macros useful in headline selection.  These rely on availability of
+ * "HeadlineParsedText *prs" describing some text, and "int shortword"
+ * describing the "short word" length parameter.
+ */
+
+/* Interesting words are non-repeated search terms */
+#define INTERESTINGWORD(j) \
+       (prs->words[j].item && !prs->words[j].repeated)
+
+/* Don't want to end at a non-word or a short word */
+#define BADENDPOINT(j) \
+       (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
+
  typedef struct
  {
+       /* one cover (well, really one fragment) for mark_hl_fragments */
+       int32           startpos;               /* fragment's starting word index */
+       int32           endpos;                 /* ending word index (inclusive) */
+       int32           poslen;                 /* number of interesting words */
+       int32           curlen;                 /* total number of words */
+       bool            chosen;                 /* chosen? */
+       bool            excluded;               /* excluded? */
+} CoverPos;
+
+typedef struct
+{
+       /* callback data for checkcondition_HL */
         HeadlineWordEntry *words;
         int                     len;
  } hlCheck;
  
+
+/*
+ * TS_execute callback for matching a tsquery operand to headline words
+ */
  static bool
  checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
  {
-       int                     i;
         hlCheck    *checkval = (hlCheck *) opaque;
+       int                     i;
  
+       /* scan words array for marching items */
         for (i = 0; i < checkval->len; i++)
         {
                 if (checkval->words[i].item == val)
                 {
-                       /* don't need to find all positions */
+                       /* if data == NULL, don't need to report positions */
                         if (!data)
                                 return true;
  
@@ -2038,8 +2075,14 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
         return false;
  }
  
+/*
+ * Apply suitable highlight marking to words selected by headline selector
+ *
+ * The words from startpos to endpos inclusive are marked per highlightall
+ */
  static void
-mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
+mark_fragment(HeadlineParsedText *prs, bool highlightall,
+                         int startpos, int endpos)
  {
         int                     i;
  
@@ -2047,7 +2090,7 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
         {
                 if (prs->words[i].item)
                         prs->words[i].selected = 1;
-               if (highlight == 0)
+               if (!highlightall)
                 {
                         if (HLIDREPLACE(prs->words[i].type))
                                 prs->words[i].replace = 1;
@@ -2064,16 +2107,15 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
         }
  }
  
-typedef struct
-{
-       int32           startpos;
-       int32           endpos;
-       int32           poslen;
-       int32           curlen;
-       int16           in;
-       int16           excluded;
-} CoverPos;
-
+/*
+ * split a cover substring into fragments not longer than max_words
+ *
+ * At entry, *startpos and *endpos are the (remaining) bounds of the cover
+ * substring.  They are updated to hold the bounds of the next fragment.
+ *
+ * *curlen and *poslen are set to the fragment's length, in words and
+ * interesting words respectively.
+ */
  static void
  get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
                                   int *curlen, int *poslen, int max_words)
@@ -2081,17 +2123,17 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
         int                     i;
  
         /*
-        * Objective: Generate a fragment of words between startpos and endpos
-        * such that it has at most max_words and both ends has query words. If
-        * the startpos and endpos are the endpoints of the cover and the cover
-        * has fewer words than max_words, then this function should just return
-        * the cover
+        * Objective: select a fragment of words between startpos and endpos such
+        * that it has at most max_words and both ends have query words. If the
+        * startpos and endpos are the endpoints of the cover and the cover has
+        * fewer words than max_words, then this function should just return the
+        * cover
          */
         /* first move startpos to an item */
         for (i = *startpos; i <= *endpos; i++)
         {
                 *startpos = i;
-               if (prs->words[i].item && !prs->words[i].repeated)
+               if (INTERESTINGWORD(i))
                         break;
         }
         /* cut endpos to have only max_words */
@@ -2101,7 +2143,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
         {
                 if (!NONWORDTOKEN(prs->words[i].type))
                         *curlen += 1;
-               if (prs->words[i].item && !prs->words[i].repeated)
+               if (INTERESTINGWORD(i))
                         *poslen += 1;
         }
         /* if the cover was cut then move back endpos to a query item */
@@ -2111,7 +2153,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
                 for (i = *endpos; i >= *startpos; i--)
                 {
                         *endpos = i;
-                       if (prs->words[i].item && !prs->words[i].repeated)
+                       if (INTERESTINGWORD(i))
                                 break;
                         if (!NONWORDTOKEN(prs->words[i].type))
                                 *curlen -= 1;
@@ -2119,8 +2161,14 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
         }
  }
  
+/*
+ * Headline selector used when MaxFragments > 0
+ *
+ * Note: in this mode, highlightall is disregarded for phrase selection;
+ * it only controls presentation details.
+ */
  static void
-mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
                                   int shortword, int min_words,
                                   int max_words, int max_fragments)
  {
@@ -2156,7 +2204,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
  
                 /*
                  * Break the cover into smaller fragments such that each fragment has
-                * at most max_words. Also ensure that each end of the fragment is a
+                * at most max_words. Also ensure that each end of each fragment is a
                  * query word. This will allow us to stretch the fragment in either
                  * direction
                  */
@@ -2173,12 +2221,13 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                         covers[numcovers].endpos = endpos;
                         covers[numcovers].curlen = curlen;
                         covers[numcovers].poslen = poslen;
-                       covers[numcovers].in = 0;
-                       covers[numcovers].excluded = 0;
+                       covers[numcovers].chosen = false;
+                       covers[numcovers].excluded = false;
                         numcovers++;
                         startpos = endpos + 1;
                         endpos = q;
                 }
+
                 /* move p to generate the next cover */
                 p++;
         }
@@ -2196,9 +2245,10 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                  */
                 for (i = 0; i < numcovers; i++)
                 {
-                       if (!covers[i].in && !covers[i].excluded &&
-                               (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
-                                                                                                && minwords > covers[i].curlen)))
+                       if (!covers[i].chosen && !covers[i].excluded &&
+                               (maxitems < covers[i].poslen ||
+                                (maxitems == covers[i].poslen &&
+                                 minwords > covers[i].curlen)))
                         {
                                 maxitems = covers[i].poslen;
                                 minwords = covers[i].curlen;
@@ -2208,7 +2258,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                 /* if a cover was found mark it */
                 if (minI >= 0)
                 {
-                       covers[minI].in = 1;
+                       covers[minI].chosen = true;
                         /* adjust the size of cover */
                         startpos = covers[minI].startpos;
                         endpos = covers[minI].endpos;
@@ -2235,8 +2285,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                                         }
                                         posmarker = i;
                                 }
-                               /* cut back startpos till we find a non short token */
-                               for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+                               /* cut back startpos till we find a good endpoint */
+                               for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
                                 {
                                         if (!NONWORDTOKEN(prs->words[i].type))
                                                 curlen--;
@@ -2250,8 +2300,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                                                 curlen++;
                                         posmarker = i;
                                 }
-                               /* cut back endpos till we find a non-short token */
-                               for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+                               /* cut back endpos till we find a good endpoint */
+                               for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
                                 {
                                         if (!NONWORDTOKEN(prs->words[i].type))
                                                 curlen--;
@@ -2262,20 +2312,24 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                         covers[minI].endpos = endpos;
                         covers[minI].curlen = curlen;
                         /* Mark the chosen fragments (covers) */
-                       mark_fragment(prs, highlight, startpos, endpos);
+                       mark_fragment(prs, highlightall, startpos, endpos);
                         num_f++;
                         /* exclude overlapping covers */
                         for (i = 0; i < numcovers; i++)
                         {
-                               if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
-                                       covers[i].excluded = 1;
+                               if (i != minI &&
+                                       ((covers[i].startpos >= covers[minI].startpos &&
+                                         covers[i].startpos <= covers[minI].endpos) ||
+                                        (covers[i].endpos >= covers[minI].startpos &&
+                                         covers[i].endpos <= covers[minI].endpos)))
+                                       covers[i].excluded = true;
                         }
                 }
                 else
                         break;
         }
  
-       /* show at least min_words we have not marked anything */
+       /* show at least min_words if we have not marked anything */
         if (num_f <= 0)
         {
                 startpos = endpos = curlen = 0;
@@ -2285,13 +2339,17 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
                                 curlen++;
                         endpos = i;
                 }
-               mark_fragment(prs, highlight, startpos, endpos);
+               mark_fragment(prs, highlightall, startpos, endpos);
         }
+
         pfree(covers);
  }
  
+/*
+ * Headline selector used when MaxFragments == 0
+ */
  static void
-mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
                           int shortword, int min_words, int max_words)
  {
         int                     p = 0,
@@ -2299,66 +2357,81 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
         int                     bestb = -1,
                                 beste = -1;
         int                     bestlen = -1;
-       int                     pose = 0,
+       int                     pose,
                                 posb,
                                 poslen,
                                 curlen;
  
         int                     i;
  
-       if (highlight == 0)
+       if (!highlightall)
         {
+               /* examine all covers, select a headline using the best one */
                 while (hlCover(prs, query, &p, &q))
                 {
-                       /* find cover len in words */
+                       /*
+                        * Count words (curlen) and interesting words (poslen) within
+                        * cover, but stop once we reach max_words.  This step doesn't
+                        * consider whether that's a good stopping point.  posb and pose
+                        * are set to the start and end indexes of the possible headline.
+                        */
                         curlen = 0;
                         poslen = 0;
+                       posb = pose = p;
                         for (i = p; i <= q && curlen < max_words; i++)
                         {
                                 if (!NONWORDTOKEN(prs->words[i].type))
                                         curlen++;
-                               if (prs->words[i].item && !prs->words[i].repeated)
+                               if (INTERESTINGWORD(i))
                                         poslen++;
                                 pose = i;
                         }
  
-                       if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+                       /* XXX this optimization seems unnecessary and wrong */
+                       if (poslen < bestlen && !BADENDPOINT(beste))
                         {
-                               /* best already found, so try one more cover */
+                               /* better cover already found, so try next cover */
                                 p++;
                                 continue;
                         }
  
-                       posb = p;
                         if (curlen < max_words)
-                       {                                       /* find good end */
+                       {
+                               /*
+                                * We have room to lengthen the headline, so search forward
+                                * until it's full or we find a good stopping point.  We'll
+                                * reconsider the word at "q", then move forward.
+                                */
                                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
                                 {
-                                       if (i != q)
+                                       if (i > q)
                                         {
                                                 if (!NONWORDTOKEN(prs->words[i].type))
                                                         curlen++;
-                                               if (prs->words[i].item && !prs->words[i].repeated)
+                                               if (INTERESTINGWORD(i))
                                                         poslen++;
                                         }
                                         pose = i;
-                                       if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                       if (BADENDPOINT(i))
                                                 continue;
                                         if (curlen >= min_words)
                                                 break;
                                 }
-                               if (curlen < min_words && i >= prs->curwords)
-                               {                               /* got end of text and our cover is shorter
-                                                                * than min_words */
+                               if (curlen < min_words)
+                               {
+                                       /*
+                                        * Reached end of text and our headline is still shorter
+                                        * than min_words, so try to extend it to the left.
+                                        */
                                         for (i = p - 1; i >= 0; i--)
                                         {
                                                 if (!NONWORDTOKEN(prs->words[i].type))
                                                         curlen++;
-                                               if (prs->words[i].item && !prs->words[i].repeated)
+                                               if (INTERESTINGWORD(i))
                                                         poslen++;
                                                 if (curlen >= max_words)
                                                         break;
-                                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                               if (BADENDPOINT(i))
                                                         continue;
                                                 if (curlen >= min_words)
                                                         break;
@@ -2367,34 +2440,48 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
                                 }
                         }
                         else
-                       {                                       /* shorter cover :((( */
+                       {
+                               /*
+                                * Can't make headline longer, so consider making it shorter
+                                * if needed to avoid a bad endpoint.
+                                */
                                 if (i > q)
                                         i = q;
                                 for (; curlen > min_words; i--)
                                 {
                                         if (!NONWORDTOKEN(prs->words[i].type))
                                                 curlen--;
-                                       if (prs->words[i].item && !prs->words[i].repeated)
+                                       if (INTERESTINGWORD(i))
                                                 poslen--;
                                         pose = i;
-                                       if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-                                               continue;
-                                       break;
+                                       if (!BADENDPOINT(i))
+                                               break;
                                 }
                         }
  
-                       if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
-                               (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
-                                (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+                       /*
+                        * Adopt this headline if it's the first, or if it has more
+                        * interesting words and isn't ending at a bad endpoint, or if it
+                        * replaces a bad endpoint with a good one (XXX even if it has
+                        * fewer interesting words?  Really?)
+                        */
+                       if (bestlen < 0 ||
+                               (poslen > bestlen && !BADENDPOINT(pose)) ||
+                               (!BADENDPOINT(pose) && BADENDPOINT(beste)))
                         {
                                 bestb = posb;
                                 beste = pose;
                                 bestlen = poslen;
                         }
  
+                       /* move p to generate the next cover */
                         p++;
                 }
  
+               /*
+                * If we found nothing acceptable, select min_words words starting at
+                * the beginning.
+                */
                 if (bestlen < 0)
                 {
                         curlen = 0;
@@ -2410,32 +2497,17 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
         }
         else
         {
+               /* highlightall mode: headline is whole document */
                 bestb = 0;
                 beste = prs->curwords - 1;
         }
  
-       for (i = bestb; i <= beste; i++)
-       {
-               if (prs->words[i].item)
-                       prs->words[i].selected = 1;
-               if (highlight == 0)
-               {
-                       if (HLIDREPLACE(prs->words[i].type))
-                               prs->words[i].replace = 1;
-                       else if (HLIDSKIP(prs->words[i].type))
-                               prs->words[i].skip = 1;
-               }
-               else
-               {
-                       if (XMLHLIDSKIP(prs->words[i].type))
-                               prs->words[i].skip = 1;
-               }
-
-               prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
-       }
-
+       mark_fragment(prs, highlightall, bestb, beste);
  }
  
+/*
+ * Default parser's prsheadline function
+ */
  Datum
  prsd_headline(PG_FUNCTION_ARGS)
  {
@@ -2443,17 +2515,18 @@ prsd_headline(PG_FUNCTION_ARGS)
         List       *prsoptions = (List *) PG_GETARG_POINTER(1);
         TSQuery         query = PG_GETARG_TSQUERY(2);
  
-       /* from opt + start and end tag */
+       /* default option values: */
         int                     min_words = 15;
         int                     max_words = 35;
         int                     shortword = 3;
         int                     max_fragments = 0;
-       int                     highlight = 0;
+       bool            highlightall = false;
         ListCell   *l;
  
-       /* config */
+       /* Extract configuration option values */
         prs->startsel = NULL;
         prs->stopsel = NULL;
+       prs->fragdelim = NULL;
         foreach(l, prsoptions)
         {
                 DefElem    *defel = (DefElem *) lfirst(l);
@@ -2474,12 +2547,12 @@ prsd_headline(PG_FUNCTION_ARGS)
                 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
                         prs->fragdelim = pstrdup(val);
                 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
-                       highlight = (pg_strcasecmp(val, "1") == 0 ||
-                                                pg_strcasecmp(val, "on") == 0 ||
-                                                pg_strcasecmp(val, "true") == 0 ||
-                                                pg_strcasecmp(val, "t") == 0 ||
-                                                pg_strcasecmp(val, "y") == 0 ||
-                                                pg_strcasecmp(val, "yes") == 0);
+                       highlightall = (pg_strcasecmp(val, "1") == 0 ||
+                                                       pg_strcasecmp(val, "on") == 0 ||
+                                                       pg_strcasecmp(val, "true") == 0 ||
+                                                       pg_strcasecmp(val, "t") == 0 ||
+                                                       pg_strcasecmp(val, "y") == 0 ||
+                                                       pg_strcasecmp(val, "yes") == 0);
                 else
                         ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2487,7 +2560,8 @@ prsd_headline(PG_FUNCTION_ARGS)
                                                         defel->defname)));
         }
  
-       if (highlight == 0)
+       /* in HighlightAll mode these parameters are ignored */
+       if (!highlightall)
         {
                 if (min_words >= max_words)
                         ereport(ERROR,
@@ -2507,18 +2581,23 @@ prsd_headline(PG_FUNCTION_ARGS)
                                          errmsg("MaxFragments should be >= 0")));
         }
  
+       /* Apply appropriate headline selector */
         if (max_fragments == 0)
-               /* call the default headline generator */
-               mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+               mark_hl_words(prs, query, highlightall, shortword,
+                                         min_words, max_words);
         else
-               mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+               mark_hl_fragments(prs, query, highlightall, shortword,
+                                                 min_words, max_words, max_fragments);
  
+       /* Fill in default values for string options */
         if (!prs->startsel)
                 prs->startsel = pstrdup("<b>");
         if (!prs->stopsel)
                 prs->stopsel = pstrdup("</b>");
         if (!prs->fragdelim)
                 prs->fragdelim = pstrdup(" ... ");
+
+       /* Caller will need these lengths, too */
         prs->startsellen = strlen(prs->startsel);
         prs->stopsellen = strlen(prs->stopsel);
         prs->fragdelimlen = strlen(prs->fragdelim);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 9 Apr 2020 16:36:59 +0000 (12:36 -0400)