diff options
| author | Teodor Sigaev | 2016-04-07 15:44:18 +0000 |
|---|---|---|
| committer | Teodor Sigaev | 2016-04-07 15:44:18 +0000 |
| commit | bb140506df605fab58f48926ee1db1f80bdafb59 (patch) | |
| tree | 581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/tsearch | |
| parent | 015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff) | |
Phrase full text search.
This patch introduces a new text search operator (<-> or <DISTANCE>) into tsquery.
The on-disk and binary in/out formats of tsquery are backward compatible.
It has two side effects:
- the sort order of tsquery changes, so users who have a btree index over
tsquery should reindex it
- fewer parentheses appear in tsquery output, so tsquery becomes more
readable
Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov
Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/tsearch')
| -rw-r--r-- | src/backend/tsearch/to_tsany.c | 187 | ||||
| -rw-r--r-- | src/backend/tsearch/ts_parse.c | 15 | ||||
| -rw-r--r-- | src/backend/tsearch/ts_selfuncs.c | 3 | ||||
| -rw-r--r-- | src/backend/tsearch/wparser_def.c | 31 |
4 files changed, 137 insertions, 99 deletions
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index aa77ec0728..3f69d74702 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -18,6 +18,13 @@ #include "utils/builtins.h" +typedef struct MorphOpaque +{ + Oid cfg_id; + int qoperator; /* query operator */ +} MorphOpaque; + + Datum get_current_ts_config(PG_FUNCTION_ARGS) { @@ -262,60 +269,81 @@ to_tsvector(PG_FUNCTION_ARGS) * to the stack. * * All words belonging to the same variant are pushed as an ANDed list, - * and different variants are ORred together. + * and different variants are ORed together. */ static void pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix) { - int32 count = 0; - ParsedText prs; - uint32 variant, - pos, - cntvar = 0, - cntpos = 0, - cnt = 0; - Oid cfg_id = DatumGetObjectId(opaque); /* the input is actually - * an Oid, not a pointer */ + int32 count = 0; + ParsedText prs; + uint32 variant, + pos = 0, + cntvar = 0, + cntpos = 0, + cnt = 0; + MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque); prs.lenwords = 4; prs.curwords = 0; prs.pos = 0; prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); - parsetext(cfg_id, &prs, strval, lenval); + parsetext(data->cfg_id, &prs, strval, lenval); if (prs.curwords > 0) { - while (count < prs.curwords) { - pos = prs.words[count].pos.pos; + /* + * Were any stop words removed? If so, fill empty positions + * with placeholders linked by an appropriate operator. 
+ */ + if (pos > 0 && pos + 1 < prs.words[count].pos.pos) + { + while (pos + 1 < prs.words[count].pos.pos) + { + /* put placeholders for each missing stop word */ + pushStop(state); + if (cntpos) + pushOperator(state, data->qoperator, 1); + cntpos++; + pos++; + } + } + + pos = prs.words[count].pos.pos; /* save current word's position */ + + /* Go through all variants obtained from this token */ cntvar = 0; while (count < prs.curwords && pos == prs.words[count].pos.pos) { variant = prs.words[count].nvariant; + /* Push all words belonging to the same variant */ cnt = 0; - while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant) + while (count < prs.curwords && + pos == prs.words[count].pos.pos && + variant == prs.words[count].nvariant) { - - pushValue(state, prs.words[count].word, prs.words[count].len, weight, - ((prs.words[count].flags & TSL_PREFIX) || prefix) ? true : false); + pushValue(state, + prs.words[count].word, + prs.words[count].len, + weight, + ((prs.words[count].flags & TSL_PREFIX) || prefix)); pfree(prs.words[count].word); if (cnt) - pushOperator(state, OP_AND); + pushOperator(state, OP_AND, 0); cnt++; count++; } if (cntvar) - pushOperator(state, OP_OR); + pushOperator(state, OP_OR, 0); cntvar++; } if (cntpos) - pushOperator(state, OP_AND); - + pushOperator(state, data->qoperator, 1); /* distance may be useful */ cntpos++; } @@ -329,44 +357,18 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, Datum to_tsquery_byid(PG_FUNCTION_ARGS) { - Oid cfgid = PG_GETARG_OID(0); - text *in = PG_GETARG_TEXT_P(1); - TSQuery query; - QueryItem *res; - int32 len; - - query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), false); - - if (query->size == 0) - PG_RETURN_TSQUERY(query); - - /* clean out any stopword placeholders from the tree */ - res = clean_fakeval(GETQUERY(query), &len); - if (!res) - { - SET_VARSIZE(query, HDRSIZETQ); - query->size = 0; - 
PG_RETURN_POINTER(query); - } - memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem)); + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - /* - * Removing the stopword placeholders might've resulted in fewer - * QueryItems. If so, move the operands up accordingly. - */ - if (len != query->size) - { - char *oldoperand = GETOPERAND(query); - int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_AND; - Assert(len < query->size); - - query->size = len; - memmove((void *) GETOPERAND(query), oldoperand, VARSIZE(query) - (oldoperand - (char *) query)); - SET_VARSIZE(query, COMPUTESIZE(len, lenoperand)); - } + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + false); - pfree(res); PG_RETURN_TSQUERY(query); } @@ -385,55 +387,60 @@ to_tsquery(PG_FUNCTION_ARGS) Datum plainto_tsquery_byid(PG_FUNCTION_ARGS) { - Oid cfgid = PG_GETARG_OID(0); - text *in = PG_GETARG_TEXT_P(1); - TSQuery query; - QueryItem *res; - int32 len; + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), true); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_AND; - if (query->size == 0) - PG_RETURN_TSQUERY(query); + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + true); - /* clean out any stopword placeholders from the tree */ - res = clean_fakeval(GETQUERY(query), &len); - if (!res) - { - SET_VARSIZE(query, HDRSIZETQ); - query->size = 0; - PG_RETURN_POINTER(query); - } - memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem)); + PG_RETURN_POINTER(query); +} - /* - * Removing the stopword placeholders might've resulted in fewer - * QueryItems. If so, move the operands up accordingly. 
- */ - if (len != query->size) - { - char *oldoperand = GETOPERAND(query); - int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query); +Datum +plainto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_P(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} - Assert(len < query->size); - query->size = len; - memmove((void *) GETOPERAND(query), oldoperand, lenoperand); - SET_VARSIZE(query, COMPUTESIZE(len, lenoperand)); - } +Datum +phraseto_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_P(1); + TSQuery query; + MorphOpaque data; - pfree(res); - PG_RETURN_POINTER(query); + data.cfg_id = PG_GETARG_OID(0); + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + true); + + PG_RETURN_TSQUERY(query); } Datum -plainto_tsquery(PG_FUNCTION_ARGS) +phraseto_tsquery(PG_FUNCTION_ARGS) { text *in = PG_GETARG_TEXT_P(0); Oid cfgId; cfgId = getTSCurrentConfig(true); - PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, + PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid, ObjectIdGetDatum(cfgId), PointerGetDatum(in))); } diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index 64cf906a5a..f0e4269e84 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -454,7 +454,7 @@ hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type) } static void -hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) +hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen) { int i; QueryItem *item = GETQUERY(query); @@ -467,6 +467,7 @@ hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) } word = &(prs->words[prs->curwords - 1]); + word->pos = LIMITPOS(pos); for (i = 0; i < query->size; i++) { if (item->type == QI_VAL && @@ -492,17 +493,20 @@ 
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme { ParsedLex *tmplexs; TSLexeme *ptr; + int32 savedpos; while (lexs) { - if (lexs->type > 0) hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type); ptr = norms; + savedpos = prs->vectorpos; while (ptr && ptr->lexeme) { - hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme)); + if (ptr->flags & TSL_ADDPOS) + savedpos++; + hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme)); ptr++; } @@ -516,6 +520,8 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme ptr = norms; while (ptr->lexeme) { + if (ptr->flags & TSL_ADDPOS) + prs->vectorpos++; pfree(ptr->lexeme); ptr++; } @@ -575,7 +581,10 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu do { if ((norms = LexizeExec(&ldata, &lexs)) != NULL) + { + prs->vectorpos++; addHLParsedLex(prs, query, lexs, norms); + } else addHLParsedLex(prs, query, lexs, NULL); } while (norms); diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 7462888b5c..c4118f1db2 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -261,7 +261,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, /* * Traverse the tsquery in preorder, calculating selectivity as: * - * selec(left_oper) * selec(right_oper) in AND nodes, + * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes, * * selec(left_oper) + selec(right_oper) - * selec(left_oper) * selec(right_oper) in OR nodes, @@ -400,6 +400,7 @@ tsquery_opr_selec(QueryItem *item, char *operand, lookup, length, minfreq); break; + case OP_PHRASE: case OP_AND: s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq); diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 4a28ce7545..2faa15ebd4 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -2030,15 +2030,36 @@ typedef struct } 
hlCheck; static bool -checkcondition_HL(void *checkval, QueryOperand *val) +checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) { int i; + hlCheck *checkval = (hlCheck *) opaque; - for (i = 0; i < ((hlCheck *) checkval)->len; i++) + for (i = 0; i < checkval->len; i++) { - if (((hlCheck *) checkval)->words[i].item == val) - return true; + if (checkval->words[i].item == val) + { + /* don't need to find all positions */ + if (!data) + return true; + + if (!data->pos) + { + data->pos = palloc(sizeof(WordEntryPos) * checkval->len); + data->allocated = true; + data->npos = 1; + data->pos[0] = checkval->words[i].pos; + } + else if (data->pos[data->npos - 1] < checkval->words[i].pos) + { + data->pos[data->npos++] = checkval->words[i].pos; + } + } } + + if (data && data->npos > 0) + return true; + return false; } @@ -2400,7 +2421,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)) { - /* best already finded, so try one more cover */ + /* best already found, so try one more cover */ p++; continue; } |
