summaryrefslogtreecommitdiff
path: root/src/backend/tsearch
diff options
context:
space:
mode:
author: Teodor Sigaev, 2016-04-07 15:44:18 +0000
committer: Teodor Sigaev, 2016-04-07 15:44:18 +0000
commit: bb140506df605fab58f48926ee1db1f80bdafb59 (patch)
tree: 581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/tsearch
parent: 015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff)
Phrase full text search.
This patch introduces a new text search operator (<-> or <DISTANCE>) into tsquery. The on-disk and binary in/out formats of tsquery are backward compatible. It has two side effects: (1) the sort order of tsquery values changes, so users who have a btree index over tsquery should reindex it; (2) tsquery output contains fewer parentheses, which makes it more readable. Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov. Reviewers: Alexander Korotkov, Artur Zakirov.
Diffstat (limited to 'src/backend/tsearch')
-rw-r--r-- src/backend/tsearch/to_tsany.c 187
-rw-r--r-- src/backend/tsearch/ts_parse.c 15
-rw-r--r-- src/backend/tsearch/ts_selfuncs.c 3
-rw-r--r-- src/backend/tsearch/wparser_def.c 31
4 files changed, 137 insertions, 99 deletions
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index aa77ec0728..3f69d74702 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -18,6 +18,13 @@
#include "utils/builtins.h"
+typedef struct MorphOpaque
+{
+ Oid cfg_id;
+ int qoperator; /* query operator */
+} MorphOpaque;
+
+
Datum
get_current_ts_config(PG_FUNCTION_ARGS)
{
@@ -262,60 +269,81 @@ to_tsvector(PG_FUNCTION_ARGS)
* to the stack.
*
* All words belonging to the same variant are pushed as an ANDed list,
- * and different variants are ORred together.
+ * and different variants are ORed together.
*/
static void
pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
{
- int32 count = 0;
- ParsedText prs;
- uint32 variant,
- pos,
- cntvar = 0,
- cntpos = 0,
- cnt = 0;
- Oid cfg_id = DatumGetObjectId(opaque); /* the input is actually
- * an Oid, not a pointer */
+ int32 count = 0;
+ ParsedText prs;
+ uint32 variant,
+ pos = 0,
+ cntvar = 0,
+ cntpos = 0,
+ cnt = 0;
+ MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
prs.lenwords = 4;
prs.curwords = 0;
prs.pos = 0;
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
- parsetext(cfg_id, &prs, strval, lenval);
+ parsetext(data->cfg_id, &prs, strval, lenval);
if (prs.curwords > 0)
{
-
while (count < prs.curwords)
{
- pos = prs.words[count].pos.pos;
+ /*
+ * Were any stop words removed? If so, fill empty positions
+ * with placeholders linked by an appropriate operator.
+ */
+ if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
+ {
+ while (pos + 1 < prs.words[count].pos.pos)
+ {
+ /* put placeholders for each missing stop word */
+ pushStop(state);
+ if (cntpos)
+ pushOperator(state, data->qoperator, 1);
+ cntpos++;
+ pos++;
+ }
+ }
+
+ pos = prs.words[count].pos.pos; /* save current word's position */
+
+ /* Go through all variants obtained from this token */
cntvar = 0;
while (count < prs.curwords && pos == prs.words[count].pos.pos)
{
variant = prs.words[count].nvariant;
+ /* Push all words belonging to the same variant */
cnt = 0;
- while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant)
+ while (count < prs.curwords &&
+ pos == prs.words[count].pos.pos &&
+ variant == prs.words[count].nvariant)
{
-
- pushValue(state, prs.words[count].word, prs.words[count].len, weight,
- ((prs.words[count].flags & TSL_PREFIX) || prefix) ? true : false);
+ pushValue(state,
+ prs.words[count].word,
+ prs.words[count].len,
+ weight,
+ ((prs.words[count].flags & TSL_PREFIX) || prefix));
pfree(prs.words[count].word);
if (cnt)
- pushOperator(state, OP_AND);
+ pushOperator(state, OP_AND, 0);
cnt++;
count++;
}
if (cntvar)
- pushOperator(state, OP_OR);
+ pushOperator(state, OP_OR, 0);
cntvar++;
}
if (cntpos)
- pushOperator(state, OP_AND);
-
+ pushOperator(state, data->qoperator, 1); /* distance may be useful */
cntpos++;
}
@@ -329,44 +357,18 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval,
Datum
to_tsquery_byid(PG_FUNCTION_ARGS)
{
- Oid cfgid = PG_GETARG_OID(0);
- text *in = PG_GETARG_TEXT_P(1);
- TSQuery query;
- QueryItem *res;
- int32 len;
-
- query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), false);
-
- if (query->size == 0)
- PG_RETURN_TSQUERY(query);
-
- /* clean out any stopword placeholders from the tree */
- res = clean_fakeval(GETQUERY(query), &len);
- if (!res)
- {
- SET_VARSIZE(query, HDRSIZETQ);
- query->size = 0;
- PG_RETURN_POINTER(query);
- }
- memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
+ text *in = PG_GETARG_TEXT_P(1);
+ TSQuery query;
+ MorphOpaque data;
- /*
- * Removing the stopword placeholders might've resulted in fewer
- * QueryItems. If so, move the operands up accordingly.
- */
- if (len != query->size)
- {
- char *oldoperand = GETOPERAND(query);
- int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
+ data.cfg_id = PG_GETARG_OID(0);
+ data.qoperator = OP_AND;
- Assert(len < query->size);
-
- query->size = len;
- memmove((void *) GETOPERAND(query), oldoperand, VARSIZE(query) - (oldoperand - (char *) query));
- SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
- }
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ false);
- pfree(res);
PG_RETURN_TSQUERY(query);
}
@@ -385,55 +387,60 @@ to_tsquery(PG_FUNCTION_ARGS)
Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)
{
- Oid cfgid = PG_GETARG_OID(0);
- text *in = PG_GETARG_TEXT_P(1);
- TSQuery query;
- QueryItem *res;
- int32 len;
+ text *in = PG_GETARG_TEXT_P(1);
+ TSQuery query;
+ MorphOpaque data;
- query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), true);
+ data.cfg_id = PG_GETARG_OID(0);
+ data.qoperator = OP_AND;
- if (query->size == 0)
- PG_RETURN_TSQUERY(query);
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ true);
- /* clean out any stopword placeholders from the tree */
- res = clean_fakeval(GETQUERY(query), &len);
- if (!res)
- {
- SET_VARSIZE(query, HDRSIZETQ);
- query->size = 0;
- PG_RETURN_POINTER(query);
- }
- memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
+ PG_RETURN_POINTER(query);
+}
- /*
- * Removing the stopword placeholders might've resulted in fewer
- * QueryItems. If so, move the operands up accordingly.
- */
- if (len != query->size)
- {
- char *oldoperand = GETOPERAND(query);
- int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
+Datum
+plainto_tsquery(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_P(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+}
- Assert(len < query->size);
- query->size = len;
- memmove((void *) GETOPERAND(query), oldoperand, lenoperand);
- SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
- }
+Datum
+phraseto_tsquery_byid(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_P(1);
+ TSQuery query;
+ MorphOpaque data;
- pfree(res);
- PG_RETURN_POINTER(query);
+ data.cfg_id = PG_GETARG_OID(0);
+ data.qoperator = OP_PHRASE;
+
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ true);
+
+ PG_RETURN_TSQUERY(query);
}
Datum
-plainto_tsquery(PG_FUNCTION_ARGS)
+phraseto_tsquery(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_P(0);
Oid cfgId;
cfgId = getTSCurrentConfig(true);
- PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
+ PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
ObjectIdGetDatum(cfgId),
PointerGetDatum(in)));
}
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
index 64cf906a5a..f0e4269e84 100644
--- a/src/backend/tsearch/ts_parse.c
+++ b/src/backend/tsearch/ts_parse.c
@@ -454,7 +454,7 @@ hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
}
static void
-hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
+hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
int i;
QueryItem *item = GETQUERY(query);
@@ -467,6 +467,7 @@ hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
}
word = &(prs->words[prs->curwords - 1]);
+ word->pos = LIMITPOS(pos);
for (i = 0; i < query->size; i++)
{
if (item->type == QI_VAL &&
@@ -492,17 +493,20 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
{
ParsedLex *tmplexs;
TSLexeme *ptr;
+ int32 savedpos;
while (lexs)
{
-
if (lexs->type > 0)
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
ptr = norms;
+ savedpos = prs->vectorpos;
while (ptr && ptr->lexeme)
{
- hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+ if (ptr->flags & TSL_ADDPOS)
+ savedpos++;
+ hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
ptr++;
}
@@ -516,6 +520,8 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
ptr = norms;
while (ptr->lexeme)
{
+ if (ptr->flags & TSL_ADDPOS)
+ prs->vectorpos++;
pfree(ptr->lexeme);
ptr++;
}
@@ -575,7 +581,10 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu
do
{
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+ {
+ prs->vectorpos++;
addHLParsedLex(prs, query, lexs, norms);
+ }
else
addHLParsedLex(prs, query, lexs, NULL);
} while (norms);
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index 7462888b5c..c4118f1db2 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -261,7 +261,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
/*
* Traverse the tsquery in preorder, calculating selectivity as:
*
- * selec(left_oper) * selec(right_oper) in AND nodes,
+ * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
*
* selec(left_oper) + selec(right_oper) -
* selec(left_oper) * selec(right_oper) in OR nodes,
@@ -400,6 +400,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
lookup, length, minfreq);
break;
+ case OP_PHRASE:
case OP_AND:
s1 = tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 4a28ce7545..2faa15ebd4 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -2030,15 +2030,36 @@ typedef struct
} hlCheck;
static bool
-checkcondition_HL(void *checkval, QueryOperand *val)
+checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
{
int i;
+ hlCheck *checkval = (hlCheck *) opaque;
- for (i = 0; i < ((hlCheck *) checkval)->len; i++)
+ for (i = 0; i < checkval->len; i++)
{
- if (((hlCheck *) checkval)->words[i].item == val)
- return true;
+ if (checkval->words[i].item == val)
+ {
+ /* don't need to find all positions */
+ if (!data)
+ return true;
+
+ if (!data->pos)
+ {
+ data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
+ data->allocated = true;
+ data->npos = 1;
+ data->pos[0] = checkval->words[i].pos;
+ }
+ else if (data->pos[data->npos - 1] < checkval->words[i].pos)
+ {
+ data->pos[data->npos++] = checkval->words[i].pos;
+ }
+ }
}
+
+ if (data && data->npos > 0)
+ return true;
+
return false;
}
@@ -2400,7 +2421,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
{
- /* best already finded, so try one more cover */
+ /* best already found, so try one more cover */
p++;
continue;
}