PG_RETURN_VOID();
}
+
+/*
+ * ts_headline support begins here
+ */
+
+/* token type classification macros */
#define LEAVETOKEN(x) ( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
+/*
+ * Macros useful in headline selection. These rely on availability of
+ * "HeadlineParsedText *prs" describing some text, and "int shortword"
+ * describing the "short word" length parameter.
+ */
+
+/* Interesting words are non-repeated search terms */
+#define INTERESTINGWORD(j) \
+ (prs->words[j].item && !prs->words[j].repeated)
+
+/* Don't want to end at a non-word or a short word */
+#define BADENDPOINT(j) \
+ (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
+
typedef struct
{
+ /* one cover (well, really one fragment) for mark_hl_fragments */
+ int32 startpos; /* fragment's starting word index */
+ int32 endpos; /* ending word index (inclusive) */
+ int32 poslen; /* number of interesting words */
+ int32 curlen; /* total number of words */
+ bool chosen; /* chosen? */
+ bool excluded; /* excluded? */
+} CoverPos;
+
+typedef struct
+{
+ /* callback data for checkcondition_HL */
HeadlineWordEntry *words;
int len;
} hlCheck;
+
+/*
+ * TS_execute callback for matching a tsquery operand to headline words
+ */
static bool
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
{
- int i;
hlCheck *checkval = (hlCheck *) opaque;
+ int i;
+ /* scan words array for matching items */
for (i = 0; i < checkval->len; i++)
{
if (checkval->words[i].item == val)
{
- /* don't need to find all positions */
+ /* if data == NULL, don't need to report positions */
if (!data)
return true;
return false;
}
+/*
+ * Apply suitable highlight marking to words selected by headline selector
+ *
+ * The words from startpos to endpos inclusive are marked per highlightall
+ */
static void
-mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
+mark_fragment(HeadlineParsedText *prs, bool highlightall,
+ int startpos, int endpos)
{
int i;
{
if (prs->words[i].item)
prs->words[i].selected = 1;
- if (highlight == 0)
+ if (!highlightall)
{
if (HLIDREPLACE(prs->words[i].type))
prs->words[i].replace = 1;
}
}
-typedef struct
-{
- int32 startpos;
- int32 endpos;
- int32 poslen;
- int32 curlen;
- int16 in;
- int16 excluded;
-} CoverPos;
-
+/*
+ * split a cover substring into fragments not longer than max_words
+ *
+ * At entry, *startpos and *endpos are the (remaining) bounds of the cover
+ * substring. They are updated to hold the bounds of the next fragment.
+ *
+ * *curlen and *poslen are set to the fragment's length, in words and
+ * interesting words respectively.
+ */
static void
get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
int *curlen, int *poslen, int max_words)
int i;
/*
- * Objective: Generate a fragment of words between startpos and endpos
- * such that it has at most max_words and both ends has query words. If
- * the startpos and endpos are the endpoints of the cover and the cover
- * has fewer words than max_words, then this function should just return
- * the cover
+ * Objective: select a fragment of words between startpos and endpos such
+ * that it has at most max_words and both ends have query words. If the
+ * startpos and endpos are the endpoints of the cover and the cover has
+ * fewer words than max_words, then this function should just return the
+ * cover
*/
/* first move startpos to an item */
for (i = *startpos; i <= *endpos; i++)
{
*startpos = i;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
break;
}
/* cut endpos to have only max_words */
{
if (!NONWORDTOKEN(prs->words[i].type))
*curlen += 1;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
*poslen += 1;
}
/* if the cover was cut then move back endpos to a query item */
for (i = *endpos; i >= *startpos; i--)
{
*endpos = i;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
break;
if (!NONWORDTOKEN(prs->words[i].type))
*curlen -= 1;
}
}
+/*
+ * Headline selector used when MaxFragments > 0
+ *
+ * Note: in this mode, highlightall is disregarded for phrase selection;
+ * it only controls presentation details.
+ */
static void
-mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
int shortword, int min_words,
int max_words, int max_fragments)
{
/*
* Break the cover into smaller fragments such that each fragment has
- * at most max_words. Also ensure that each end of the fragment is a
+ * at most max_words. Also ensure that each end of each fragment is a
* query word. This will allow us to stretch the fragment in either
* direction
*/
covers[numcovers].endpos = endpos;
covers[numcovers].curlen = curlen;
covers[numcovers].poslen = poslen;
- covers[numcovers].in = 0;
- covers[numcovers].excluded = 0;
+ covers[numcovers].chosen = false;
+ covers[numcovers].excluded = false;
numcovers++;
startpos = endpos + 1;
endpos = q;
}
+
/* move p to generate the next cover */
p++;
}
*/
for (i = 0; i < numcovers; i++)
{
- if (!covers[i].in && !covers[i].excluded &&
- (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
- && minwords > covers[i].curlen)))
+ if (!covers[i].chosen && !covers[i].excluded &&
+ (maxitems < covers[i].poslen ||
+ (maxitems == covers[i].poslen &&
+ minwords > covers[i].curlen)))
{
maxitems = covers[i].poslen;
minwords = covers[i].curlen;
/* if a cover was found mark it */
if (minI >= 0)
{
- covers[minI].in = 1;
+ covers[minI].chosen = true;
/* adjust the size of cover */
startpos = covers[minI].startpos;
endpos = covers[minI].endpos;
}
posmarker = i;
}
- /* cut back startpos till we find a non short token */
- for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+ /* cut back startpos till we find a good endpoint */
+ for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
curlen++;
posmarker = i;
}
- /* cut back endpos till we find a non-short token */
- for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+ /* cut back endpos till we find a good endpoint */
+ for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
covers[minI].endpos = endpos;
covers[minI].curlen = curlen;
/* Mark the chosen fragments (covers) */
- mark_fragment(prs, highlight, startpos, endpos);
+ mark_fragment(prs, highlightall, startpos, endpos);
num_f++;
/* exclude overlapping covers */
for (i = 0; i < numcovers; i++)
{
- if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
- covers[i].excluded = 1;
+ if (i != minI &&
+ ((covers[i].startpos >= covers[minI].startpos &&
+ covers[i].startpos <= covers[minI].endpos) ||
+ (covers[i].endpos >= covers[minI].startpos &&
+ covers[i].endpos <= covers[minI].endpos)))
+ covers[i].excluded = true;
}
}
else
break;
}
- /* show at least min_words we have not marked anything */
+ /* show at least min_words if we have not marked anything */
if (num_f <= 0)
{
startpos = endpos = curlen = 0;
curlen++;
endpos = i;
}
- mark_fragment(prs, highlight, startpos, endpos);
+ mark_fragment(prs, highlightall, startpos, endpos);
}
+
pfree(covers);
}
+/*
+ * Headline selector used when MaxFragments == 0
+ */
static void
-mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
int shortword, int min_words, int max_words)
{
int p = 0,
int bestb = -1,
beste = -1;
int bestlen = -1;
- int pose = 0,
+ int pose,
posb,
poslen,
curlen;
int i;
- if (highlight == 0)
+ if (!highlightall)
{
+ /* examine all covers, select a headline using the best one */
while (hlCover(prs, query, &p, &q))
{
- /* find cover len in words */
+ /*
+ * Count words (curlen) and interesting words (poslen) within
+ * cover, but stop once we reach max_words. This step doesn't
+ * consider whether that's a good stopping point. posb and pose
+ * are set to the start and end indexes of the possible headline.
+ */
curlen = 0;
poslen = 0;
+ posb = pose = p;
for (i = p; i <= q && curlen < max_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
poslen++;
pose = i;
}
- if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+ /* XXX this optimization seems unnecessary and wrong */
+ if (poslen < bestlen && !BADENDPOINT(beste))
{
- /* best already found, so try one more cover */
+ /* better cover already found, so try next cover */
p++;
continue;
}
- posb = p;
if (curlen < max_words)
- { /* find good end */
+ {
+ /*
+ * We have room to lengthen the headline, so search forward
+ * until it's full or we find a good stopping point. We'll
+ * reconsider the word at "q", then move forward.
+ */
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
{
- if (i != q)
+ if (i > q)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
poslen++;
}
pose = i;
- if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+ if (BADENDPOINT(i))
continue;
if (curlen >= min_words)
break;
}
- if (curlen < min_words && i >= prs->curwords)
- { /* got end of text and our cover is shorter
- * than min_words */
+ if (curlen < min_words)
+ {
+ /*
+ * Reached end of text and our headline is still shorter
+ * than min_words, so try to extend it to the left.
+ */
for (i = p - 1; i >= 0; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
poslen++;
if (curlen >= max_words)
break;
- if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+ if (BADENDPOINT(i))
continue;
if (curlen >= min_words)
break;
}
}
else
- { /* shorter cover :((( */
+ {
+ /*
+ * Can't make headline longer, so consider making it shorter
+ * if needed to avoid a bad endpoint.
+ */
if (i > q)
i = q;
for (; curlen > min_words; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
- if (prs->words[i].item && !prs->words[i].repeated)
+ if (INTERESTINGWORD(i))
poslen--;
pose = i;
- if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
- continue;
- break;
+ if (!BADENDPOINT(i))
+ break;
}
}
- if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
- (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
- (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+ /*
+ * Adopt this headline if it's the first, or if it has more
+ * interesting words and isn't ending at a bad endpoint, or if it
+ * replaces a bad endpoint with a good one (XXX even if it has
+ * fewer interesting words? Really?)
+ */
+ if (bestlen < 0 ||
+ (poslen > bestlen && !BADENDPOINT(pose)) ||
+ (!BADENDPOINT(pose) && BADENDPOINT(beste)))
{
bestb = posb;
beste = pose;
bestlen = poslen;
}
+ /* move p to generate the next cover */
p++;
}
+ /*
+ * If we found nothing acceptable, select min_words words starting at
+ * the beginning.
+ */
if (bestlen < 0)
{
curlen = 0;
}
else
{
+ /* highlightall mode: headline is whole document */
bestb = 0;
beste = prs->curwords - 1;
}
- for (i = bestb; i <= beste; i++)
- {
- if (prs->words[i].item)
- prs->words[i].selected = 1;
- if (highlight == 0)
- {
- if (HLIDREPLACE(prs->words[i].type))
- prs->words[i].replace = 1;
- else if (HLIDSKIP(prs->words[i].type))
- prs->words[i].skip = 1;
- }
- else
- {
- if (XMLHLIDSKIP(prs->words[i].type))
- prs->words[i].skip = 1;
- }
-
- prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
- }
-
+ mark_fragment(prs, highlightall, bestb, beste);
}
+/*
+ * Default parser's prsheadline function
+ */
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
List *prsoptions = (List *) PG_GETARG_POINTER(1);
TSQuery query = PG_GETARG_TSQUERY(2);
- /* from opt + start and end tag */
+ /* default option values: */
int min_words = 15;
int max_words = 35;
int shortword = 3;
int max_fragments = 0;
- int highlight = 0;
+ bool highlightall = false;
ListCell *l;
- /* config */
+ /* Extract configuration option values */
prs->startsel = NULL;
prs->stopsel = NULL;
+ prs->fragdelim = NULL;
foreach(l, prsoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
prs->fragdelim = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
+ highlightall = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
defel->defname)));
}
- if (highlight == 0)
+ /* in HighlightAll mode these parameters are ignored */
+ if (!highlightall)
{
if (min_words >= max_words)
ereport(ERROR,
errmsg("MaxFragments should be >= 0")));
}
+ /* Apply appropriate headline selector */
if (max_fragments == 0)
- /* call the default headline generator */
- mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ mark_hl_words(prs, query, highlightall, shortword,
+ min_words, max_words);
else
- mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+ mark_hl_fragments(prs, query, highlightall, shortword,
+ min_words, max_words, max_fragments);
+ /* Fill in default values for string options */
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
prs->stopsel = pstrdup("</b>");
if (!prs->fragdelim)
prs->fragdelim = pstrdup(" ... ");
+
+ /* Caller will need these lengths, too */
prs->startsellen = strlen(prs->startsel);
prs->stopsellen = strlen(prs->stopsel);
prs->fragdelimlen = strlen(prs->fragdelim);