From e6dbcb72fafa4031c73cc914e829a6dec96ab6b6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 16 May 2008 16:31:02 +0000 Subject: Extend GIN to support partial-match searches, and extend tsquery to support prefix matching using this facility. Teodor Sigaev and Oleg Bartunov --- doc/src/sgml/datatype.sgml | 28 ++++++++--- doc/src/sgml/gin.sgml | 113 ++++++++++++++++++++++++++++++++++++------- doc/src/sgml/textsearch.sgml | 19 +++++++- doc/src/sgml/xindex.sgml | 9 +++- 4 files changed, 140 insertions(+), 29 deletions(-) (limited to 'doc/src/sgml') diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index fb813d7042..48dfe0a9c4 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1,4 +1,4 @@ - + Data Types @@ -3298,18 +3298,17 @@ SELECT * FROM test; SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; tsvector ---------------------------------------------------- - 'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat' + 'a' 'and' 'ate' 'cat' 'fat' 'mat' 'on' 'rat' 'sat' - (As the example shows, the sorting is first by length and then - alphabetically, but that detail is seldom important.) To represent + To represent lexemes containing whitespace or punctuation, surround them with quotes: SELECT $$the lexeme ' ' contains spaces$$::tsvector; tsvector ------------------------------------------- - 'the' ' ' 'lexeme' 'spaces' 'contains' + ' ' 'contains' 'lexeme' 'spaces' 'the' (We use dollar-quoted string literals in this example and the next one, @@ -3320,7 +3319,7 @@ SELECT $$the lexeme ' ' contains spaces$$::tsvector; SELECT $$the lexeme 'Joe''s' contains a quote$$::tsvector; tsvector ------------------------------------------------ - 'a' 'the' 'Joe''s' 'quote' 'lexeme' 'contains' + 'Joe''s' 'a' 'contains' 'lexeme' 'quote' 'the' Optionally, integer position(s) @@ -3330,7 +3329,7 @@ SELECT $$the lexeme 'Joe''s' contains a quote$$::tsvector; SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector; tsvector ------------------------------------------------------------------------------- - 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 + 'a':1,6,10 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'on':5 'rat':12 'sat':4 A position normally indicates the source word's location in the @@ -3369,7 +3368,7 @@ SELECT 'a:1A fat:2B,4C cat:5D'::tsvector; select 'The Fat Rats'::tsvector; tsvector -------------------- - 'Fat' 'The' 'Rats' + 'Fat' 'Rats' 'The' For most English-text-searching applications the above words would @@ -3439,6 +3438,19 @@ SELECT 'fat:ab & cat'::tsquery; + + Also, lexemes in a tsquery can be labeled with * + to specify prefix matching: + +SELECT 'super:*'::tsquery; + tsquery +----------- + 'super':* + + This query will match any word in a tsvector that begins + with super. + + Quoting rules for lexemes are the same as described above for lexemes in tsvector; and, as with tsvector, diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml index ad82da6b38..961451f714 100644 --- a/doc/src/sgml/gin.sgml +++ b/doc/src/sgml/gin.sgml @@ -1,4 +1,4 @@ - + GIN Indexes @@ -52,15 +52,15 @@ - All it takes to get a GIN access method working - is to implement four user-defined methods, which define the behavior of + All it takes to get a GIN access method working is to + implement four (or five) user-defined methods, which define the behavior of keys in the tree and the relationships between keys, indexed values, and indexable queries. In short, GIN combines extensibility with generality, code reuse, and a clean interface. - The four methods that an index operator class for + The four methods that an operator class for GIN must provide are: @@ -77,7 +77,7 @@ - Datum* extractValue(Datum inputValue, int32 *nkeys) + Datum *extractValue(Datum inputValue, int32 *nkeys) Returns an array of keys given a value to be indexed. The @@ -87,8 +87,8 @@ - Datum* extractQuery(Datum query, int32 *nkeys, - StrategyNumber n) + Datum *extractQuery(Datum query, int32 *nkeys, + StrategyNumber n, bool **pmatch) Returns an array of keys given a value to be queried; that is, @@ -100,13 +100,22 @@ to consult n to determine the data type of query and the key values that need to be extracted. The number of returned keys must be stored into *nkeys. - If number of keys is equal to zero then extractQuery - should store 0 or -1 into *nkeys. 0 means that any - row matches the query and sequence scan should be - produced. -1 means nothing can satisfy query. - Choice of value should be based on semantics meaning of operation with - given strategy number. + If the query contains no keys then extractQuery + should store 0 or -1 into *nkeys, depending on the + semantics of the operator. 0 means that every + value matches the query and a sequential scan should be + produced. -1 means nothing can match the query. + pmatch is an output argument for use when partial match + is supported. To use it, extractQuery must allocate + an array of *nkeys booleans and store its address at + *pmatch. Each element of the array should be set to TRUE + if the corresponding key requires partial match, FALSE if not. + If *pmatch is set to NULL then GIN assumes partial match + is not required. The variable is initialized to NULL before call, + so this argument can simply be ignored by operator classes that do + not support partial match. + @@ -133,6 +142,39 @@ + + Optionally, an operator class for + GIN can supply a fifth method: + + + + + + int comparePartial(Datum partial_key, Datum key, StrategyNumber n) + + + Compare a partial-match query to an index key. Returns an integer + whose sign indicates the result: less than zero means the index key + does not match the query, but the index scan should continue; zero + means that the index key does match the query; greater than zero + indicates that the index scan should stop because no more matches + are possible. The strategy number n of the operator + that generated the partial match query is provided, in case its + semantics are needed to determine when to end the scan. + + + + + + + + To support partial match queries, an operator class must + provide the comparePartial method, and its + extractQuery method must set the pmatch + parameter when a partial-match query is encountered. See + for details. + + @@ -146,6 +188,33 @@ list of heap pointers (PL, posting list) if the list is small enough. + + Partial match algorithm + + + GIN can support partial match queries, in which the query + does not determine an exact match for one or more keys, but the possible + matches fall within a reasonably narrow range of key values (within the + key sorting order determined by the compare support method). + The extractQuery method, instead of returning a key value + to be matched exactly, returns a key value that is the lower bound of + the range to be searched, and sets the pmatch flag true. + The key range is then searched using the comparePartial + method. comparePartial must return zero for an actual + match, less than zero for a non-match that is still within the range + to be searched, or greater than zero if the index key is past the range + that could match. + + + + During a partial-match scan, all itemPointers for matching keys + are OR'ed into a TIDBitmap. + The scan fails if the TIDBitmap becomes lossy. + In this case an error message will be reported with advice + to increase work_mem. + + + @@ -236,8 +305,14 @@ - GIN searches keys only by equality matching. This might - be improved in future. + It is possible for an operator class to circumvent the restriction against + full index scan. To do that, extractValue must return at least + one (possibly dummy) key for every indexed value, and + extractQuery must convert an unrestricted search into + a partial-match query that will scan the whole index. This is inefficient + but might be necessary to avoid corner-case failures with operators such + as LIKE. Note however that failure could still occur if the intermediate + TIDBitmap becomes lossy. @@ -247,9 +322,11 @@ The PostgreSQL source distribution includes GIN operator classes for tsvector and - for one-dimensional arrays of all internal types. The following - contrib modules also contain GIN - operator classes: + for one-dimensional arrays of all internal types. Prefix searching in + tsvector is implemented using the GIN partial match + feature. + The following contrib modules also contain + GIN operator classes: diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index caa8847ef8..41db566b6c 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -754,6 +754,20 @@ SELECT to_tsquery('english', 'Fat | Rats:AB'); 'fat' | 'rat':AB + Also, * can be attached to a lexeme to specify prefix matching: + + +SELECT to_tsquery('supern:*A & star:A*B'); + to_tsquery +-------------------------- + 'supern':*A & 'star':*AB + + + Such a lexeme will match any word in a tsvector that begins + with the given string. + + + to_tsquery can also accept single-quoted phrases. This is primarily useful when the configuration includes a thesaurus dictionary that may trigger on such phrases. @@ -798,7 +812,8 @@ SELECT to_tsquery('''supernovae stars'' & !crab'); Note that plainto_tsquery cannot - recognize either Boolean operators or weight labels in its input: + recognize Boolean operators, weight labels, or prefix-match labels + in its input: SELECT plainto_tsquery('english', 'The Fat & Rats:C'); diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 6bf7535b63..84b2c9050a 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -1,4 +1,4 @@ - + Interfacing Extensions To Indexes @@ -444,6 +444,13 @@ consistent - determine whether value matches query condition 4 + + comparePartial - (optional method) compare partial key from + query and key from index, and return an integer less than zero, zero, + or greater than zero, indicating whether GIN should ignore this index + entry, treat the entry as a match, or stop the index scan + 5 + -- cgit v1.2.3