diff options
Diffstat (limited to 'contrib/pg_trgm')
-rw-r--r-- | contrib/pg_trgm/Makefile | 4 | ||||
-rw-r--r-- | contrib/pg_trgm/expected/pg_trgm.out | 289 | ||||
-rw-r--r-- | contrib/pg_trgm/pg_trgm--1.0--1.1.sql | 12 | ||||
-rw-r--r-- | contrib/pg_trgm/pg_trgm--1.1.sql (renamed from contrib/pg_trgm/pg_trgm--1.0.sql) | 14 | ||||
-rw-r--r-- | contrib/pg_trgm/pg_trgm.control | 2 | ||||
-rw-r--r-- | contrib/pg_trgm/sql/pg_trgm.sql | 53 | ||||
-rw-r--r-- | contrib/pg_trgm/trgm.h | 36 | ||||
-rw-r--r-- | contrib/pg_trgm/trgm_gin.c | 57 | ||||
-rw-r--r-- | contrib/pg_trgm/trgm_gist.c | 229 | ||||
-rw-r--r-- | contrib/pg_trgm/trgm_op.c | 182 | ||||
-rw-r--r-- | contrib/pg_trgm/trgm_regexp.c | 2247 |
11 files changed, 2973 insertions, 152 deletions
diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index 64fd69f2cb..0d549f8b6c 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -1,10 +1,10 @@ # contrib/pg_trgm/Makefile MODULE_big = pg_trgm -OBJS = trgm_op.o trgm_gist.o trgm_gin.o +OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o EXTENSION = pg_trgm -DATA = pg_trgm--1.0.sql pg_trgm--unpackaged--1.0.sql +DATA = pg_trgm--1.1.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql REGRESS = pg_trgm diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index e7af7d4890..13b1fde1b8 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -53,8 +53,14 @@ select similarity('wow',' WOW '); 1 (1 row) +select similarity('---', '####---'); + similarity +------------ + 0 +(1 row) + CREATE TABLE test_trgm(t text); -\copy test_trgm from 'data/trgm.data +\copy test_trgm from 'data/trgm.data' select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; t | sml -------------+---------- @@ -3464,6 +3470,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 create table test2(t text); insert into test2 values ('abcdef'); insert into test2 values ('quark'); +insert into test2 values (' z foo bar'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -3497,6 +3504,12 @@ select * from test2 where t like '%bcd%'; abcdef (1 row) +select * from test2 where t like E'%\\bcd%'; + t +-------- + abcdef +(1 row) + select * from test2 where t ilike '%BCD%'; t -------- @@ -3509,6 +3522,142 @@ select * from test2 where t ilike 'qua%'; quark (1 row) +select * from test2 where t like '%z foo bar%'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t like ' z foo%'; + t +------------- + z foo bar +(1 row) + +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; + QUERY PLAN +-------------------------------------------- + Bitmap Heap Scan on test2 + Recheck Cond: (t ~ '[abc]{3}'::text) + -> Bitmap Index Scan on test2_idx_gin + Index Cond: (t ~ '[abc]{3}'::text) +(4 rows) + +explain (costs off) + select * from test2 where t ~* 'DEF'; + QUERY PLAN +------------------------------------------ + Bitmap Heap Scan on test2 + Recheck Cond: (t ~* 'DEF'::text) + -> Bitmap Index Scan on test2_idx_gin + Index Cond: (t ~* 'DEF'::text) +(4 rows) + +select * from test2 where t ~ '[abc]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'a[bc]+d'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ '(abc)*$'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* 'DEF'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'dEf'; + t +--- +(0 rows) + +select * from test2 where t ~* '^q'; + t +------- + quark +(1 row) + +select * from test2 where t ~* '[abc]{3}[def]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* 'ab[a-z]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* '(^| )qua'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q.*rk$'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q'; + t +------- + quark +(1 row) + +select * from test2 where t ~ '[a-z]{3}'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; + t +--- +(0 rows) + +select * from test2 where t ~ 'z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo'; + t +------------- + z foo bar +(1 row) + drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -3539,6 +3688,12 @@ select * from test2 where t like '%bcd%'; abcdef (1 row) +select * from test2 where t like E'%\\bcd%'; + t +-------- + abcdef +(1 row) + select * from test2 where t ilike '%BCD%'; t -------- @@ -3551,3 +3706,135 @@ select * from test2 where t ilike 'qua%'; quark (1 row) +select * from test2 where t like '%z foo bar%'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t like ' z foo%'; + t +------------- + z foo bar +(1 row) + +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~ '[abc]{3}'::text) +(2 rows) + +explain (costs off) + select * from test2 where t ~* 'DEF'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~* 'DEF'::text) +(2 rows) + +select * from test2 where t ~ '[abc]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'a[bc]+d'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ '(abc)*$'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* 'DEF'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'dEf'; + t +--- +(0 rows) + +select * from test2 where t ~* '^q'; + t +------- + quark +(1 row) + +select * from test2 where t ~* '[abc]{3}[def]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* 'ab[a-z]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* '(^| )qua'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q.*rk$'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q'; + t +------- + quark +(1 row) + +select * from test2 where t ~ '[a-z]{3}'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; + t +--- +(0 rows) + +select * from test2 where t ~ 'z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo'; + t +------------- + z foo bar +(1 row) + diff --git a/contrib/pg_trgm/pg_trgm--1.0--1.1.sql b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql new file mode 100644 index 0000000000..b4e3e26037 --- /dev/null +++ b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql @@ -0,0 +1,12 @@ +/* contrib/pg_trgm/pg_trgm--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit + +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + +ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); diff --git a/contrib/pg_trgm/pg_trgm--1.0.sql b/contrib/pg_trgm/pg_trgm--1.1.sql index 8067bd6033..1fff7af2c4 100644 --- a/contrib/pg_trgm/pg_trgm--1.0.sql +++ b/contrib/pg_trgm/pg_trgm--1.1.sql @@ -1,4 +1,4 @@ -/* contrib/pg_trgm/pg_trgm--1.0.sql */ +/* contrib/pg_trgm/pg_trgm--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit @@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD OPERATOR 4 pg_catalog.~~* (text, text), FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid); +-- Add operators that are new in 9.3. + +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + -- support functions for gin CREATE FUNCTION gin_extract_value_trgm(text, internal) RETURNS internal @@ -164,3 +170,9 @@ AS ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD OPERATOR 3 pg_catalog.~~ (text, text), OPERATOR 4 pg_catalog.~~* (text, text); + +-- Add operators that are new in 9.3. + +ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); diff --git a/contrib/pg_trgm/pg_trgm.control b/contrib/pg_trgm/pg_trgm.control index 70404d881d..2ac51e6890 100644 --- a/contrib/pg_trgm/pg_trgm.control +++ b/contrib/pg_trgm/pg_trgm.control @@ -1,5 +1,5 @@ # pg_trgm extension comment = 'text similarity measurement and index searching based on trigrams' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/pg_trgm' relocatable = true diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index ea902f602f..7b02d98818 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -11,9 +11,11 @@ select show_trgm('a b C0*%^'); select similarity('wow','WOWa '); select similarity('wow',' WOW '); +select similarity('---', '####---'); + CREATE TABLE test_trgm(t text); -\copy test_trgm from 'data/trgm.data +\copy test_trgm from 'data/trgm.data' select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; @@ -41,6 +43,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 create table test2(t text); insert into test2 values ('abcdef'); insert into test2 values ('quark'); +insert into test2 values (' z foo bar'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -49,8 +52,32 @@ explain (costs off) select * from test2 where t ilike '%BCD%'; select * from test2 where t like '%BCD%'; select * from test2 where t like '%bcd%'; +select * from test2 where t like E'%\\bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; +select * from test2 where t like '%z foo bar%'; +select * from test2 where t like ' z foo%'; +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; +explain (costs off) + select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ '[abc]{3}'; +select * from test2 where t ~ 'a[bc]+d'; +select * from test2 where t ~ '(abc)*$'; +select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ 'dEf'; +select * from test2 where t ~* '^q'; +select * from test2 where t ~* '[abc]{3}[def]{3}'; +select * from test2 where t ~* 'ab[a-z]{3}'; +select * from test2 where t ~* '(^| )qua'; +select * from test2 where t ~ 'q.*rk$'; +select * from test2 where t ~ 'q'; +select * from test2 where t ~ '[a-z]{3}'; +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; +select * from test2 where t ~ 'z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo'; drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -60,5 +87,29 @@ explain (costs off) select * from test2 where t ilike '%BCD%'; select * from test2 where t like '%BCD%'; select * from test2 where t like '%bcd%'; +select * from test2 where t like E'%\\bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; +select * from test2 where t like '%z foo bar%'; +select * from test2 where t like ' z foo%'; +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; +explain (costs off) + select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ '[abc]{3}'; +select * from test2 where t ~ 'a[bc]+d'; +select * from test2 where t ~ '(abc)*$'; +select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ 'dEf'; +select * from test2 where t ~* '^q'; +select * from test2 where t ~* '[abc]{3}[def]{3}'; +select * from test2 where t ~* 'ab[a-z]{3}'; +select * from test2 where t ~* '(^| )qua'; +select * from test2 where t ~ 'q.*rk$'; +select * from test2 where t ~ 'q'; +select * from test2 where t ~ '[a-z]{3}'; +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; +select * from test2 where t ~ 'z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo'; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 067f29d4da..ed649b8dcc 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -7,18 +7,20 @@ #include "access/gist.h" #include "access/itup.h" #include "storage/bufpage.h" -#include "utils/builtins.h" -/* options */ +/* + * Options ... but note that trgm_regexp.c effectively assumes these values + * of LPADDING and RPADDING. + */ #define LPADDING 2 #define RPADDING 1 #define KEEPONLYALNUM /* * Caution: IGNORECASE macro means that trigrams are case-insensitive. - * If this macro is disabled, the ~~* operator must be removed from the - * operator classes, because we can't handle case-insensitive wildcard search - * with case-sensitive trigrams. Failure to do this will result in "cannot - * handle ~~* with case-sensitive trigrams" errors. + * If this macro is disabled, the ~* and ~~* operators must be removed from + * the operator classes, because we can't handle case-insensitive wildcard + * search with case-sensitive trigrams. Failure to do this will result in + * "cannot handle ~*(~~*) with case-sensitive trigrams" errors. */ #define IGNORECASE #define DIVUNION @@ -28,6 +30,8 @@ #define DistanceStrategyNumber 2 #define LikeStrategyNumber 3 #define ILikeStrategyNumber 4 +#define RegExpStrategyNumber 5 +#define RegExpICaseStrategyNumber 6 typedef char trgm[3]; @@ -42,11 +46,11 @@ typedef char trgm[3]; *(((char*)(a))+2) = *(((char*)(b))+2); \ } while(0); -uint32 trgm2int(trgm *ptr); - #ifdef KEEPONLYALNUM +#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #else +#define ISWORDCHR(c) (!t_isspace(c)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) #endif #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) @@ -99,11 +103,19 @@ typedef char *BITVECP; #define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) ) #define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) ) +typedef struct TrgmPackedGraph TrgmPackedGraph; + extern float4 trgm_limit; -TRGM *generate_trgm(char *str, int slen); -TRGM *generate_wildcard_trgm(const char *str, int slen); -float4 cnt_sml(TRGM *trg1, TRGM *trg2); -bool trgm_contained_by(TRGM *trg1, TRGM *trg2); +extern uint32 trgm2int(trgm *ptr); +extern void compact_trigram(trgm *tptr, char *str, int bytelen); +extern TRGM *generate_trgm(char *str, int slen); +extern TRGM *generate_wildcard_trgm(const char *str, int slen); +extern float4 cnt_sml(TRGM *trg1, TRGM *trg2); +extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2); +extern bool *trgm_presence_map(TRGM *query, TRGM *key); +extern TRGM *createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext); +extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check); #endif /* __TRGM_H__ */ diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index 114fb784c4..c59925c575 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -10,16 +10,9 @@ PG_FUNCTION_INFO_V1(gin_extract_trgm); -Datum gin_extract_trgm(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gin_extract_value_trgm); -Datum gin_extract_value_trgm(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gin_extract_query_trgm); -Datum gin_extract_query_trgm(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gin_trgm_consistent); -Datum gin_trgm_consistent(PG_FUNCTION_ARGS); /* * This function can only be called if a pre-9.1 version of the GIN operator @@ -80,13 +73,15 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) StrategyNumber strategy = PG_GETARG_UINT16(2); /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ - /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4); + /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); Datum *entries = NULL; TRGM *trg; int32 trglen; trgm *ptr; + TrgmPackedGraph *graph; int32 i; switch (strategy) @@ -107,6 +102,34 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) */ trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + trg = createTrgmNFA(val, PG_GET_COLLATION(), + &graph, CurrentMemoryContext); + if (trg && ARRNELEM(trg) > 0) + { + /* + * Successful regex processing: store NFA-like graph as + * extra_data. GIN API requires an array of nentries + * Pointers, but we just put the same value in each element. + */ + trglen = ARRNELEM(trg); + *extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen); + for (i = 0; i < trglen; i++) + (*extra_data)[i] = (Pointer) graph; + } + else + { + /* No result: have to do full index scan. */ + *nentries = 0; + *searchMode = GIN_SEARCH_MODE_ALL; + PG_RETURN_POINTER(entries); + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); trg = NULL; /* keep compiler quiet */ @@ -146,8 +169,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) /* text *query = PG_GETARG_TEXT_P(2); */ int32 nkeys = PG_GETARG_INT32(3); - - /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); bool res; int32 i, @@ -189,6 +211,21 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) } } break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + if (nkeys < 1) + { + /* Regex processing gave no result: do full index scan */ + res = true; + } + else + res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0], + check); + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index d59c8eb670..69dc7f71f0 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -8,37 +8,35 @@ #include "access/skey.h" -PG_FUNCTION_INFO_V1(gtrgm_in); -Datum gtrgm_in(PG_FUNCTION_ARGS); +typedef struct +{ + /* most recent inputs to gtrgm_consistent */ + StrategyNumber strategy; + text *query; + /* extracted trigrams for query */ + TRGM *trigrams; + /* if a regex operator, the extracted graph */ + TrgmPackedGraph *graph; -PG_FUNCTION_INFO_V1(gtrgm_out); -Datum gtrgm_out(PG_FUNCTION_ARGS); + /* + * The "query" and "trigrams" are stored in the same palloc block as this + * cache struct, at MAXALIGN'ed offsets. The graph however isn't. + */ +} gtrgm_consistent_cache; -PG_FUNCTION_INFO_V1(gtrgm_compress); -Datum gtrgm_compress(PG_FUNCTION_ARGS); +#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) -PG_FUNCTION_INFO_V1(gtrgm_decompress); -Datum gtrgm_decompress(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(gtrgm_in); +PG_FUNCTION_INFO_V1(gtrgm_out); +PG_FUNCTION_INFO_V1(gtrgm_compress); +PG_FUNCTION_INFO_V1(gtrgm_decompress); PG_FUNCTION_INFO_V1(gtrgm_consistent); -Datum gtrgm_consistent(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gtrgm_distance); -Datum gtrgm_distance(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gtrgm_union); -Datum gtrgm_union(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gtrgm_same); -Datum gtrgm_same(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gtrgm_penalty); -Datum gtrgm_penalty(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(gtrgm_picksplit); -Datum gtrgm_picksplit(PG_FUNCTION_ARGS); - -#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) /* Number of one-bits in an unsigned byte */ static const uint8 number_of_ones[256] = { @@ -78,10 +76,10 @@ gtrgm_out(PG_FUNCTION_ARGS) static void makesign(BITVECP sign, TRGM *a) { - int4 k, + int32 k, len = ARRNELEM(a); trgm *ptr = GETARR(a); - int4 tmp = 0; + int32 tmp = 0; MemSet((void *) sign, 0, sizeof(BITVEC)); SETBIT(sign, SIGLENBIT); /* set last unused bit */ @@ -112,7 +110,7 @@ gtrgm_compress(PG_FUNCTION_ARGS) else if (ISSIGNKEY(DatumGetPointer(entry->key)) && !ISALLTRUE(DatumGetPointer(entry->key))) { - int4 i, + int32 i, len; TRGM *res; BITVECP sign = GETSIGN(DatumGetPointer(entry->key)); @@ -160,14 +158,14 @@ gtrgm_decompress(PG_FUNCTION_ARGS) } } -static int4 +static int32 cnt_sml_sign_common(TRGM *qtrg, BITVECP sign) { - int4 count = 0; - int4 k, + int32 count = 0; + int32 k, len = ARRNELEM(qtrg); trgm *ptr = GETARR(qtrg); - int4 tmp = 0; + int32 tmp = 0; for (k = 0; k < len; k++) { @@ -191,24 +189,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *qtrg; bool res; Size querysize = VARSIZE(query); - char *cache = (char *) fcinfo->flinfo->fn_extra, - *cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber)); + gtrgm_consistent_cache *cache; /* - * Store both the strategy number and extracted trigrams in cache, because - * trigram extraction is relatively CPU-expensive. We must include - * strategy number because trigram extraction depends on strategy. + * We keep the extracted trigrams in cache, because trigram extraction is + * relatively CPU-expensive. When trying to reuse a cached value, check + * strategy number not just query itself, because trigram extraction + * depends on strategy. * - * The cached structure contains the strategy number, then the input query - * (starting at a MAXALIGN boundary), then the TRGM value (also starting - * at a MAXALIGN boundary). + * The cached structure is a single palloc chunk containing the + * gtrgm_consistent_cache header, then the input query (starting at a + * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN + * boundary). However we don't try to include the regex graph (if any) in + * that struct. (XXX currently, this approach can leak regex graphs + * across index rescans. Not clear if that's worth fixing.) */ + cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra; if (cache == NULL || - strategy != *((StrategyNumber *) cache) || - VARSIZE(cachedQuery) != querysize || - memcmp(cachedQuery, query, querysize) != 0) + cache->strategy != strategy || + VARSIZE(cache->query) != querysize || + memcmp((char *) cache->query, (char *) query, querysize) != 0) { - char *newcache; + gtrgm_consistent_cache *newcache; + TrgmPackedGraph *graph = NULL; + Size qtrgsize; switch (strategy) { @@ -225,28 +229,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS) qtrg = generate_wildcard_trgm(VARDATA(query), querysize - VARHDRSZ); break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + qtrg = createTrgmNFA(query, PG_GET_COLLATION(), + &graph, fcinfo->flinfo->fn_mcxt); + /* just in case an empty array is returned ... */ + if (qtrg && ARRNELEM(qtrg) <= 0) + { + pfree(qtrg); + qtrg = NULL; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); qtrg = NULL; /* keep compiler quiet */ break; } - newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, - MAXALIGN(sizeof(StrategyNumber)) + - MAXALIGN(querysize) + - VARSIZE(qtrg)); - cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber)); + qtrgsize = qtrg ? VARSIZE(qtrg) : 0; - *((StrategyNumber *) newcache) = strategy; - memcpy(cachedQuery, query, querysize); - memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg)); + newcache = (gtrgm_consistent_cache *) + MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + MAXALIGN(sizeof(gtrgm_consistent_cache)) + + MAXALIGN(querysize) + + qtrgsize); + + newcache->strategy = strategy; + newcache->query = (text *) + ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache))); + memcpy((char *) newcache->query, (char *) query, querysize); + if (qtrg) + { + newcache->trigrams = (TRGM *) + ((char *) newcache->query + MAXALIGN(querysize)); + memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize); + /* release qtrg in case it was made in fn_mcxt */ + pfree(qtrg); + } + else + newcache->trigrams = NULL; + newcache->graph = graph; if (cache) pfree(cache); - fcinfo->flinfo->fn_extra = newcache; + fcinfo->flinfo->fn_extra = (void *) newcache; + cache = newcache; } - qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize)); + qtrg = cache->trigrams; switch (strategy) { @@ -267,8 +301,8 @@ gtrgm_consistent(PG_FUNCTION_ARGS) } else { /* non-leaf contains signature */ - int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); - int4 len = ARRNELEM(qtrg); + int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); + int32 len = ARRNELEM(qtrg); if (len == 0) res = false; @@ -317,6 +351,63 @@ gtrgm_consistent(PG_FUNCTION_ARGS) } } break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + /* Regexp search is inexact */ + *recheck = true; + + /* Check regex match as much as we can with available info */ + if (qtrg) + { + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + bool *check; + + check = trgm_presence_map(qtrg, key); + res = trigramsMatchGraph(cache->graph, check); + pfree(check); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int32 k, + tmp = 0, + len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + bool *check; + + /* + * GETBIT() tests may give false positives, due to limited + * size of the sign array. But since trigramsMatchGraph() + * implements a monotone boolean function, false positives + * in the check array can't lead to false negative answer. + * So we can apply trigramsMatchGraph despite uncertainty, + * and that usefully improves the quality of the search. + */ + check = (bool *) palloc(len * sizeof(bool)); + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + check[k] = GETBIT(sign, HASHVAL(tmp)); + } + res = trigramsMatchGraph(cache->graph, check); + pfree(check); + } + } + else + { + /* trigram-free query must be rechecked everywhere */ + res = true; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ @@ -379,8 +470,8 @@ gtrgm_distance(PG_FUNCTION_ARGS) } else { /* non-leaf contains signature */ - int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); - int4 len = ARRNELEM(qtrg); + int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); + int32 len = ARRNELEM(qtrg); res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len); } @@ -394,10 +485,10 @@ gtrgm_distance(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(res); } -static int4 +static int32 unionkey(BITVECP sbase, TRGM *add) { - int4 i; + int32 i; if (ISSIGNKEY(add)) { @@ -412,7 +503,7 @@ unionkey(BITVECP sbase, TRGM *add) else { trgm *ptr = GETARR(add); - int4 tmp = 0; + int32 tmp = 0; for (i = 0; i < ARRNELEM(add); i++) { @@ -428,11 +519,11 @@ Datum gtrgm_union(PG_FUNCTION_ARGS) { GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); - int4 len = entryvec->n; + int32 len = entryvec->n; int *size = (int *) PG_GETARG_POINTER(1); BITVEC base; - int4 i; - int4 flag = 0; + int32 i; + int32 flag = 0; TRGM *result; MemSet((void *) base, 0, sizeof(BITVEC)); @@ -474,7 +565,7 @@ gtrgm_same(PG_FUNCTION_ARGS) *result = false; else { - int4 i; + int32 i; BITVECP sa = GETSIGN(a), sb = GETSIGN(b); @@ -491,7 +582,7 @@ gtrgm_same(PG_FUNCTION_ARGS) } else { /* a and b ISARRKEY */ - int4 lena = ARRNELEM(a), + int32 lena = ARRNELEM(a), lenb = ARRNELEM(b); if (lena != lenb) @@ -500,7 +591,7 @@ gtrgm_same(PG_FUNCTION_ARGS) { trgm *ptra = GETARR(a), *ptrb = GETARR(b); - int4 i; + int32 i; *result = true; for (i = 0; i < lena; i++) @@ -515,10 +606,10 @@ gtrgm_same(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } -static int4 +static int32 sizebitvec(BITVECP sign) { - int4 size = 0, + int32 size = 0, i; LOOPBYTE @@ -634,7 +725,7 @@ fillcache(CACHESIGN *item, TRGM *key) typedef struct { OffsetNumber pos; - int4 cost; + int32 cost; } SPLITCOST; static int @@ -675,11 +766,11 @@ gtrgm_picksplit(PG_FUNCTION_ARGS) *datum_r; BITVECP union_l, union_r; - int4 size_alpha, + int32 size_alpha, size_beta; - int4 size_waste, + int32 size_waste, waste = -1; - int4 nbytes; + int32 nbytes; OffsetNumber seed_1 = 0, seed_2 = 0; OffsetNumber *left, diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 4e32c6f654..c385e09edd 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -9,6 +9,7 @@ #include "catalog/pg_type.h" #include "tsearch/ts_locale.h" +#include "utils/memutils.h" PG_MODULE_MAGIC; @@ -16,22 +17,11 @@ PG_MODULE_MAGIC; float4 trgm_limit = 0.3f; PG_FUNCTION_INFO_V1(set_limit); -Datum set_limit(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(show_limit); -Datum show_limit(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(show_trgm); -Datum show_trgm(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(similarity); -Datum similarity(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(similarity_dist); -Datum similarity_dist(PG_FUNCTION_ARGS); - PG_FUNCTION_INFO_V1(similarity_op); -Datum similarity_op(PG_FUNCTION_ARGS); Datum @@ -77,12 +67,6 @@ unique_array(trgm *a, int len) return curend + 1 - a; } -#ifdef KEEPONLYALNUM -#define iswordchr(c) (t_isalpha(c) || t_isdigit(c)) -#else -#define iswordchr(c) (!t_isspace(c)) -#endif - /* * Finds first word in string, returns pointer to the word, * endword points to the character after word @@ -92,7 +76,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen) { char *beginword = str; - while (beginword - str < lenstr && !iswordchr(beginword)) + while (beginword - str < lenstr && !ISWORDCHR(beginword)) beginword += pg_mblen(beginword); if (beginword - str >= lenstr) @@ -100,7 +84,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen) *endword = beginword; *charlen = 0; - while (*endword - str < lenstr && iswordchr(*endword)) + while (*endword - str < lenstr && ISWORDCHR(*endword)) { *endword += pg_mblen(*endword); (*charlen)++; @@ -109,9 +93,13 @@ find_word(char *str, int lenstr, char **endword, int *charlen) return beginword; } -#ifdef USE_WIDE_UPPER_LOWER -static void -cnt_trigram(trgm *tptr, char *str, int bytelen) +/* + * Reduce a trigram (three possibly multi-byte characters) to a trgm, + * which is always exactly three bytes. If we have three single-byte + * characters, we just use them as-is; otherwise we form a hash value. + */ +void +compact_trigram(trgm *tptr, char *str, int bytelen) { if (bytelen == 3) { @@ -131,7 +119,6 @@ cnt_trigram(trgm *tptr, char *str, int bytelen) CPTRGM(tptr, &crc); } } -#endif /* * Adds trigrams from words (already padded). @@ -144,16 +131,16 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) if (charlen < 3) return tptr; -#ifdef USE_WIDE_UPPER_LOWER - if (pg_database_encoding_max_length() > 1) + if (bytelen > charlen) { + /* Find multibyte character boundaries and apply compact_trigram */ int lenfirst = pg_mblen(str), lenmiddle = pg_mblen(str + lenfirst), lenlast = pg_mblen(str + lenfirst + lenmiddle); while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) { - cnt_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast); + compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast); ptr += lenfirst; tptr++; @@ -164,8 +151,8 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) } } else -#endif { + /* Fast path when there are no multibyte characters */ Assert(bytelen == charlen); while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ ) @@ -191,6 +178,18 @@ generate_trgm(char *str, int slen) char *bword, *eword; + /* + * Guard against possible overflow in the palloc requests below. (We + * don't worry about the additive constants, since palloc can detect + * requests that are a little above MaxAllocSize --- we just need to + * prevent integer overflow in the multiplications.) + */ + if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || + (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3); trg->flag = ARRKEY; SET_VARSIZE(trg, TRGMHDRSIZE); @@ -200,7 +199,8 @@ generate_trgm(char *str, int slen) tptr = GETARR(trg); - buf = palloc(sizeof(char) * (slen + 4)); + /* Allocate a buffer for case-folded, blank-padded words */ + buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4); if (LPADDING > 0) { @@ -224,6 +224,7 @@ generate_trgm(char *str, int slen) #ifdef IGNORECASE pfree(bword); #endif + buf[LPADDING + bytelen] = ' '; buf[LPADDING + bytelen + 1] = ' '; @@ -239,7 +240,10 @@ generate_trgm(char *str, int slen) if ((len = tptr - GETARR(trg)) == 0) return trg; - if (len > 0) + /* + * Make trigrams unique. + */ + if (len > 1) { qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); len = unique_array(GETARR(trg), len); @@ -272,33 +276,36 @@ get_wildcard_part(const char *str, int lenstr, const char *beginword = str; const char *endword; char *s = buf; - bool in_wildcard_meta = false; + bool in_leading_wildcard_meta = false; + bool in_trailing_wildcard_meta = false; bool in_escape = false; int clen; /* - * Find the first word character remembering whether last character was - * wildcard meta-character. + * Find the first word character, remembering whether preceding character + * was wildcard meta-character. Note that the in_escape state persists + * from this loop to the next one, since we may exit at a word character + * that is in_escape. */ while (beginword - str < lenstr) { if (in_escape) { - in_escape = false; - in_wildcard_meta = false; - if (iswordchr(beginword)) + if (ISWORDCHR(beginword)) break; + in_escape = false; + in_leading_wildcard_meta = false; } else { if (ISESCAPECHAR(beginword)) in_escape = true; else if (ISWILDCARDCHAR(beginword)) - in_wildcard_meta = true; - else if (iswordchr(beginword)) + in_leading_wildcard_meta = true; + else if (ISWORDCHR(beginword)) break; else - in_wildcard_meta = false; + in_leading_wildcard_meta = false; } beginword += pg_mblen(beginword); } @@ -310,11 +317,11 @@ get_wildcard_part(const char *str, int lenstr, return NULL; /* - * Add left padding spaces if last character wasn't wildcard + * Add left padding spaces if preceding character wasn't wildcard * meta-character. */ *charlen = 0; - if (!in_wildcard_meta) + if (!in_leading_wildcard_meta) { if (LPADDING > 0) { @@ -333,23 +340,29 @@ get_wildcard_part(const char *str, int lenstr, * string boundary. Strip escapes during copy. */ endword = beginword; - in_wildcard_meta = false; - in_escape = false; while (endword - str < lenstr) { clen = pg_mblen(endword); if (in_escape) { - in_escape = false; - in_wildcard_meta = false; - if (iswordchr(endword)) + if (ISWORDCHR(endword)) { memcpy(s, endword, clen); (*charlen)++; s += clen; } else + { + /* + * Back up endword to the escape character when stopping at an + * escaped char, so that subsequent get_wildcard_part will + * restart from the escape character. We assume here that + * escape chars are single-byte. + */ + endword--; break; + } + in_escape = false; } else { @@ -357,29 +370,26 @@ get_wildcard_part(const char *str, int lenstr, in_escape = true; else if (ISWILDCARDCHAR(endword)) { - in_wildcard_meta = true; + in_trailing_wildcard_meta = true; break; } - else if (iswordchr(endword)) + else if (ISWORDCHR(endword)) { memcpy(s, endword, clen); (*charlen)++; s += clen; } else - { - in_wildcard_meta = false; break; - } } endword += clen; } /* - * Add right padding spaces if last character wasn't wildcard + * Add right padding spaces if next character isn't wildcard * meta-character. */ - if (!in_wildcard_meta) + if (!in_trailing_wildcard_meta) { if (RPADDING > 0) { @@ -416,6 +426,18 @@ generate_wildcard_trgm(const char *str, int slen) bytelen; const char *eword; + /* + * Guard against possible overflow in the palloc requests below. (We + * don't worry about the additive constants, since palloc can detect + * requests that are a little above MaxAllocSize --- we just need to + * prevent integer overflow in the multiplications.) + */ + if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || + (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3); trg->flag = ARRKEY; SET_VARSIZE(trg, TRGMHDRSIZE); @@ -425,6 +447,7 @@ generate_wildcard_trgm(const char *str, int slen) tptr = GETARR(trg); + /* Allocate a buffer for blank-padded, but not yet case-folded, words */ buf = palloc(sizeof(char) * (slen + 4)); /* @@ -445,6 +468,7 @@ generate_wildcard_trgm(const char *str, int slen) * count trigrams */ tptr = make_trigrams(tptr, buf2, bytelen, charlen); + #ifdef IGNORECASE pfree(buf2); #endif @@ -458,7 +482,7 @@ generate_wildcard_trgm(const char *str, int slen) /* * Make trigrams unique. */ - if (len > 0) + if (len > 1) { qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); len = unique_array(GETARR(trg), len); @@ -547,6 +571,10 @@ cnt_sml(TRGM *trg1, TRGM *trg2) len1 = ARRNELEM(trg1); len2 = ARRNELEM(trg2); + /* explicit test is needed to avoid 0/0 division when both lengths are 0 */ + if (len1 <= 0 || len2 <= 0) + return (float4) 0.0; + while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) { int res = CMPTRGM(ptr1, ptr2); @@ -564,9 +592,9 @@ cnt_sml(TRGM *trg1, TRGM *trg2) } #ifdef DIVUNION - return ((((float4) count) / ((float4) (len1 + len2 - count)))); + return ((float4) count) / ((float4) (len1 + len2 - count)); #else - return (((float) count) / ((float) ((len1 > len2) ? len1 : len2))); + return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2)); #endif } @@ -609,6 +637,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2) return true; } +/* + * Return a palloc'd boolean array showing, for each trigram in "query", + * whether it is present in the trigram array "key". + * This relies on the "key" array being sorted, but "query" need not be. + */ +bool * +trgm_presence_map(TRGM *query, TRGM *key) +{ + bool *result; + trgm *ptrq = GETARR(query), + *ptrk = GETARR(key); + int lenq = ARRNELEM(query), + lenk = ARRNELEM(key), + i; + + result = (bool *) palloc0(lenq * sizeof(bool)); + + /* for each query trigram, do a binary search in the key array */ + for (i = 0; i < lenq; i++) + { + int lo = 0; + int hi = lenk; + + while (lo < hi) + { + int mid = (lo + hi) / 2; + int res = CMPTRGM(ptrq, ptrk + mid); + + if (res < 0) + hi = mid; + else if (res > 0) + lo = mid + 1; + else + { + result[i] = true; + break; + } + } + ptrq++; + } + + return result; +} + Datum similarity(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c new file mode 100644 index 0000000000..9f050533c5 --- /dev/null +++ b/contrib/pg_trgm/trgm_regexp.c @@ -0,0 +1,2247 @@ +/*------------------------------------------------------------------------- + * + * trgm_regexp.c + * Regular expression matching using trigrams. + * + * The general idea of trigram index support for a regular expression (regex) + * search is to transform the regex into a logical expression on trigrams. + * For example: + * + * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg + * + * If a string matches the regex, then it must match the logical expression on + * trigrams. The opposite is not necessarily true, however: a string that + * matches the logical expression might not match the original regex. Such + * false positives are removed via recheck, by running the regular regex match + * operator on the retrieved heap tuple. + * + * Since the trigram expression involves both AND and OR operators, we can't + * expect the core index machinery to evaluate it completely. Instead, the + * result of regex analysis is a list of trigrams to be sought in the index, + * plus a simplified graph that is used by trigramsMatchGraph() to determine + * whether a particular indexed value matches the expression. + * + * Converting a regex to a trigram expression is based on analysis of an + * automaton corresponding to the regex. The algorithm consists of four + * stages: + * + * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL + * regexp library, which provides accessors for its opaque regex_t struct + * to expose the NFA state graph and the "colors" (sets of equivalent + * characters) used as state transition labels. + * + * 2) Transform the original NFA into an expanded graph, where arcs + * are labeled with trigrams that must be present in order to move from + * one state to another via the arcs. The trigrams used in this stage + * consist of colors, not characters, as in the original NFA. + * + * 3) Expand the color trigrams into regular trigrams consisting of + * characters. If too many distinct trigrams are produced, trigrams are + * eliminated and the graph is simplified until it's simple enough. + * + * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct, + * and returned to the caller. + * + * 1) Compile the regexp to NFA form + * --------------------------------- + * The automaton returned by the regexp compiler is a graph where vertices + * are "states" and arcs are labeled with colors. Each color represents + * a set of characters, so that all characters assigned to the same color + * are interchangeable, so far as matching the regexp is concerned. There + * are two special states: "initial" and "final". A state can have multiple + * outgoing arcs labeled with the same color, which makes the automaton + * non-deterministic, because it can be in many states simultaneously. + * + * Note that this NFA is already lossy compared to the original regexp, + * since it ignores some regex features such as lookahead constraints and + * backref matching. This is OK for our purposes since it's still the case + * that only strings matching the NFA can possibly satisfy the regexp. + * + * 2) Transform the original NFA into an expanded graph + * ---------------------------------------------------- + * In the 2nd stage, the automaton is transformed into a graph based on the + * original NFA. Each state in the expanded graph represents a state from + * the original NFA, plus a prefix identifying the last two characters + * (colors, to be precise) seen before entering the state. There can be + * multiple states in the expanded graph for each state in the original NFA, + * depending on what characters can precede it. A prefix position can be + * "unknown" if it's uncertain what the preceding character was, or "blank" + * if the character was a non-word character (we don't need to distinguish + * which non-word character it was, so just think of all of them as blanks). + * + * For convenience in description, call an expanded-state identifier + * (two prefix colors plus a state number from the original NFA) an + * "enter key". + * + * Each arc of the expanded graph is labelled with a trigram that must be + * present in the string to match. We can construct this from an out-arc of + * the underlying NFA state by combining the expanded state's prefix with the + * color label of the underlying out-arc, if neither prefix position is + * "unknown". But note that some of the colors in the trigram might be + * "blank". This is OK since we want to generate word-boundary trigrams as + * the regular trigram machinery would, if we know that some word characters + * must be adjacent to a word boundary in all strings matching the NFA. + * + * The expanded graph can also have fewer states than the original NFA, + * because we don't bother to make a separate state entry unless the state + * is reachable by a valid arc. When an enter key is reachable from a state + * of the expanded graph, but we do not know a complete trigram associated + * with that transition, we cannot make a valid arc; instead we insert the + * enter key into the enterKeys list of the source state. This effectively + * means that the two expanded states are not reliably distinguishable based + * on examining trigrams. + * + * So the expanded graph resembles the original NFA, but the arcs are + * labeled with trigrams instead of individual characters, and there may be + * more or fewer states. It is a lossy representation of the original NFA: + * any string that matches the original regexp must match the expanded graph, + * but the reverse is not true. + * + * We build the expanded graph through a breadth-first traversal of states + * reachable from the initial state. At each reachable state, we identify the + * states reachable from it without traversing a predictable trigram, and add + * those states' enter keys to the current state. Then we generate all + * out-arcs leading out of this collection of states that have predictable + * trigrams, adding their target states to the queue of states to examine. + * + * When building the graph, if the number of states or arcs exceed pre-defined + * limits, we give up and simply mark any states not yet processed as final + * states. Roughly speaking, that means that we make use of some portion from + * the beginning of the regexp. Also, any colors that have too many member + * characters are treated as "unknown", so that we can't derive trigrams + * from them. + * + * 3) Expand the color trigrams into regular trigrams + * -------------------------------------------------- + * The trigrams in the expanded graph are "color trigrams", consisting + * of three consecutive colors that must be present in the string. But for + * search, we need regular trigrams consisting of characters. In the 3rd + * stage, the color trigrams are expanded into regular trigrams. Since each + * color can represent many characters, the total number of regular trigrams + * after expansion could be very large. Because searching the index for + * thousands of trigrams would be slow, and would likely produce so many + * false positives that we would have to traverse a large fraction of the + * index, the graph is simplified further in a lossy fashion by removing + * color trigrams. When a color trigram is removed, the states connected by + * any arcs labelled with that trigram are merged. + * + * Trigrams do not all have equivalent value for searching: some of them are + * more frequent and some of them are less frequent. Ideally, we would like + * to know the distribution of trigrams, but we don't. But because of padding + * we know for sure that the empty character is more frequent than others, + * so we can penalize trigrams according to presence of whitespace. The + * penalty assigned to each color trigram is the number of simple trigrams + * it would produce, times the penalties[] multiplier associated with its + * whitespace content. (The penalties[] constants were calculated by analysis + * of some real-life text.) We eliminate color trigrams starting with the + * highest-penalty one, until we get to a total penalty of no more than + * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would + * lead to merging the initial and final states, so we may not be able to + * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than + * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail. + * + * 4) Pack the graph into a compact representation + * ----------------------------------------------- + * The 2nd and 3rd stages might have eliminated or merged many of the states + * and trigrams created earlier, so in this final stage, the graph is + * compacted and packed into a simpler struct that contains only the + * information needed to evaluate it. + * + * ALGORITHM EXAMPLE: + * + * Consider the example regex "ab[cd]". This regex is transformed into the + * following NFA (for simplicity we show colors as their single members): + * + * 4# + * c/ + * a b / + * 1* --- 2 ---- 3 + * \ + * d\ + * 5# + * + * We use * to mark initial state and # to mark final state. It's not depicted, + * but states 1, 4, 5 have self-referencing arcs for all possible characters, + * because this pattern can match to any part of a string. + * + * As the result of stage 2 we will have the following graph: + * + * abc abd + * 2# <---- 1* ----> 3# + * + * The process for generating this graph is: + * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1). + * 2) Add key (UNKNOWN, "a", 2) to state 1. + * 3) Add key ("a", "b", 3) to state 1. + * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc + * from state 1 to state 2 with label trigram "abc". + * 5) Mark state 2 final because state 4 of source NFA is marked as final. + * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc + * from state 1 to state 3 with label trigram "abd". + * 7) Mark state 3 final because state 5 of source NFA is marked as final. + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/pg_trgm/trgm_regexp.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "trgm.h" + +#include "regex/regexport.h" +#include "tsearch/ts_locale.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" + + +/* + * Uncomment to print intermediate stages, for exploring and debugging the + * algorithm implementation. This produces three graph files in /tmp, + * in Graphviz .dot format. + */ +/* #define TRGM_REGEXP_DEBUG */ + +/* + * These parameters are used to limit the amount of work done. + * Otherwise regex processing could be too slow and memory-consuming. + * + * MAX_EXPANDED_STATES - How many states we allow in expanded graph + * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph + * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted + * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties + * COLOR_COUNT_LIMIT - Maximum number of characters per color + */ +#define MAX_EXPANDED_STATES 128 +#define MAX_EXPANDED_ARCS 1024 +#define MAX_TRGM_COUNT 256 +#define WISH_TRGM_PENALTY 16 +#define COLOR_COUNT_LIMIT 256 + +/* + * Penalty multipliers for trigram counts depending on whitespace contents. + * Numbers based on analysis of real-life texts. + */ +const float4 penalties[8] = { + 1.0f, /* "aaa" */ + 3.5f, /* "aa " */ + 0.0f, /* "a a" (impossible) */ + 0.0f, /* "a " (impossible) */ + 4.2f, /* " aa" */ + 2.1f, /* " a " */ + 25.0f, /* " a" */ + 0.0f /* " " (impossible) */ +}; + +/* Struct representing a single pg_wchar, converted back to multibyte form */ +typedef struct +{ + char bytes[MAX_MULTIBYTE_CHAR_LEN]; +} trgm_mb_char; + +/* + * Attributes of NFA colors: + * + * expandable - we know the character expansion of this color + * containsNonWord - color contains non-word characters + * (which will not be extracted into trigrams) + * wordCharsCount - count of word characters in color + * wordChars - array of this color's word characters + * (which can be extracted into trigrams) + * + * When expandable is false, the other attributes don't matter; we just + * assume this color represents unknown character(s). + */ +typedef struct +{ + bool expandable; + bool containsNonWord; + int wordCharsCount; + trgm_mb_char *wordChars; +} TrgmColorInfo; + +/* + * A "prefix" is information about the colors of the last two characters read + * before reaching a specific NFA state. These colors can have special values + * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no + * information, for example because we read some character of an unexpandable + * color. COLOR_BLANK means that we read a non-word character. + * + * We call a prefix ambiguous if at least one of its colors is unknown. It's + * fully ambiguous if both are unknown, partially ambiguous if only the first + * is unknown. (The case of first color known, second unknown is not valid.) + * + * Wholly- or partly-blank prefixes are mostly handled the same as regular + * color prefixes. This allows us to generate appropriate partly-blank + * trigrams when the NFA requires word character(s) to appear adjacent to + * non-word character(s). + */ +typedef int TrgmColor; + +/* We assume that colors returned by the regexp engine cannot be these: */ +#define COLOR_UNKNOWN (-1) +#define COLOR_BLANK (-2) + +typedef struct +{ + TrgmColor colors[2]; +} TrgmPrefix; + +/* + * Color-trigram data type. Note that some elements of the trigram can be + * COLOR_BLANK, but we don't allow COLOR_UNKNOWN. + */ +typedef struct +{ + TrgmColor colors[3]; +} ColorTrgm; + +/* + * Key identifying a state of our expanded graph: color prefix, and number + * of the corresponding state in the underlying regex NFA. The color prefix + * shows how we reached the regex state (to the extent that we know it). + */ +typedef struct +{ + TrgmPrefix prefix; + int nstate; +} TrgmStateKey; + +/* + * One state of the expanded graph. + * + * stateKey - ID of this state + * arcs - outgoing arcs of this state (List of TrgmArc) + * enterKeys - enter keys reachable from this state without reading any + * predictable trigram (List of TrgmStateKey) + * fin - flag indicating this state is final + * init - flag indicating this state is initial + * parent - parent state, if this state has been merged into another + * children - child states (states that have been merged into this one) + * number - number of this state (used at the packaging stage) + */ +typedef struct TrgmState +{ + TrgmStateKey stateKey; /* hashtable key: must be first field */ + List *arcs; + List *enterKeys; + bool fin; + bool init; + struct TrgmState *parent; + List *children; + int number; +} TrgmState; + +/* + * One arc in the expanded graph. + */ +typedef struct +{ + ColorTrgm ctrgm; /* trigram needed to traverse arc */ + TrgmState *target; /* next state */ +} TrgmArc; + +/* + * Information about arc of specific color trigram (used in stage 3) + * + * Contains pointers to the source and target states. + */ +typedef struct +{ + TrgmState *source; + TrgmState *target; +} TrgmArcInfo; + +/* + * Information about color trigram (used in stage 3) + * + * ctrgm - trigram itself + * number - number of this trigram (used in the packaging stage) + * count - number of simple trigrams created from this color trigram + * expanded - indicates this color trigram is expanded into simple trigrams + * arcs - list of all arcs labeled with this color trigram. + */ +typedef struct +{ + ColorTrgm ctrgm; + int number; + int count; + float4 penalty; + bool expanded; + List *arcs; +} ColorTrgmInfo; + +/* + * Data structure representing all the data we need during regex processing. + * + * regex - compiled regex + * colorInfo - extracted information about regex's colors + * ncolors - number of colors in colorInfo[] + * states - hashtable of TrgmStates (states of expanded graph) + * initState - pointer to initial state of expanded graph + * queue - queue of to-be-processed TrgmStates + * keysQueue - queue of to-be-processed TrgmStateKeys + * arcsCount - total number of arcs of expanded graph (for resource + * limiting) + * overflowed - we have exceeded resource limit for transformation + * colorTrgms - array of all color trigrams present in graph + * colorTrgmsCount - count of those color trigrams + * totalTrgmCount - total count of extracted simple trigrams + */ +typedef struct +{ + /* Source regexp, and color information extracted from it (stage 1) */ + regex_t *regex; + TrgmColorInfo *colorInfo; + int ncolors; + + /* Expanded graph (stage 2) */ + HTAB *states; + TrgmState *initState; + + /* Workspace for stage 2 */ + List *queue; + List *keysQueue; + int arcsCount; + bool overflowed; + + /* Information about distinct color trigrams in the graph (stage 3) */ + ColorTrgmInfo *colorTrgms; + int colorTrgmsCount; + int totalTrgmCount; +} TrgmNFA; + +/* + * Final, compact representation of expanded graph. + */ +typedef struct +{ + int targetState; /* index of target state (zero-based) */ + int colorTrgm; /* index of color trigram for transition */ +} TrgmPackedArc; + +typedef struct +{ + int arcsCount; /* number of out-arcs for this state */ + TrgmPackedArc *arcs; /* array of arcsCount packed arcs */ +} TrgmPackedState; + +/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */ +struct TrgmPackedGraph +{ + /* + * colorTrigramsCount and colorTrigramsGroups contain information about + * how trigrams are grouped into color trigrams. "colorTrigramsCount" is + * the count of color trigrams and "colorTrigramGroups" contains number of + * simple trigrams for each color trigram. The array of simple trigrams + * (stored separately from this struct) is ordered so that the simple + * trigrams for each color trigram are consecutive, and they're in order + * by color trigram number. + */ + int colorTrigramsCount; + int *colorTrigramGroups; /* array of size colorTrigramsCount */ + + /* + * The states of the simplified NFA. State number 0 is always initial + * state and state number 1 is always final state. + */ + int statesCount; + TrgmPackedState *states; /* array of size statesCount */ + + /* Temporary work space for trigramsMatchGraph() */ + bool *colorTrigramsActive; /* array of size colorTrigramsCount */ + bool *statesActive; /* array of size statesCount */ + int *statesQueue; /* array of size statesCount */ +}; + +/* + * Temporary structure for representing an arc during packaging. + */ +typedef struct +{ + int sourceState; + int targetState; + int colorTrgm; +} TrgmPackArcInfo; + + +/* prototypes for private functions */ +static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, + MemoryContext rcontext); +static void RE_compile(regex_t *regex, text *text_re, + int cflags, Oid collation); +static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); +static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); +static void transformGraph(TrgmNFA *trgmNFA); +static void processState(TrgmNFA *trgmNFA, TrgmState *state); +static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); +static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key); +static void addArcs(TrgmNFA *trgmNFA, TrgmState *state); +static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, + TrgmColor co, TrgmStateKey *destKey); +static bool validArcLabel(TrgmStateKey *key, TrgmColor co); +static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key); +static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2); +static bool selectColorTrigrams(TrgmNFA *trgmNFA); +static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext); +static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); +static void mergeStates(TrgmState *state1, TrgmState *state2); +static int colorTrgmInfoCmp(const void *p1, const void *p2); +static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2); +static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); +static int packArcInfoCmp(const void *a1, const void *a2); + +#ifdef TRGM_REGEXP_DEBUG +static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors); +static void printTrgmNFA(TrgmNFA *trgmNFA); +static void printTrgmColor(StringInfo buf, TrgmColor co); +static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams); +#endif + + +/* + * Main entry point to process a regular expression. + * + * Returns an array of trigrams required by the regular expression, or NULL if + * the regular expression was too complex to analyze. In addition, a packed + * graph representation of the regex is returned into *graph. The results + * must be allocated in rcontext (which might or might not be the current + * context). + */ +TRGM * +createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext) +{ + TRGM *trg; + regex_t regex; + MemoryContext tmpcontext; + MemoryContext oldcontext; + + /* + * This processing generates a great deal of cruft, which we'd like to + * clean up before returning (since this function may be called in a + * query-lifespan memory context). Make a temp context we can work in so + * that cleanup is easy. + */ + tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "createTrgmNFA temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(tmpcontext); + + /* + * Stage 1: Compile the regexp into a NFA, using the regexp library. + */ +#ifdef IGNORECASE + RE_compile(®ex, text_re, REG_ADVANCED | REG_ICASE, collation); +#else + RE_compile(®ex, text_re, REG_ADVANCED, collation); +#endif + + /* + * Since the regexp library allocates its internal data structures with + * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets + * done even if there's an error. + */ + PG_TRY(); + { + trg = createTrgmNFAInternal(®ex, graph, rcontext); + } + PG_CATCH(); + { + pg_regfree(®ex); + PG_RE_THROW(); + } + PG_END_TRY(); + + pg_regfree(®ex); + + /* Clean up all the cruft we created */ + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); + + return trg; +} + +/* + * Body of createTrgmNFA, exclusive of regex compilation/freeing. + */ +static TRGM * +createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, + MemoryContext rcontext) +{ + TRGM *trg; + TrgmNFA trgmNFA; + + trgmNFA.regex = regex; + + /* Collect color information from the regex */ + getColorInfo(regex, &trgmNFA); + +#ifdef TRGM_REGEXP_DEBUG + printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors); +#endif + + /* + * Stage 2: Create an expanded graph from the source NFA. + */ + transformGraph(&trgmNFA); + +#ifdef TRGM_REGEXP_DEBUG + printTrgmNFA(&trgmNFA); +#endif + + /* + * Fail if we were unable to make a nontrivial graph, ie it is possible to + * get from the initial state to the final state without reading any + * predictable trigram. + */ + if (trgmNFA.initState->fin) + return NULL; + + /* + * Stage 3: Select color trigrams to expand. Fail if too many trigrams. + */ + if (!selectColorTrigrams(&trgmNFA)) + return NULL; + + /* + * Stage 4: Expand color trigrams and pack graph into final + * representation. + */ + trg = expandColorTrigrams(&trgmNFA, rcontext); + + *graph = packGraph(&trgmNFA, rcontext); + +#ifdef TRGM_REGEXP_DEBUG + printTrgmPackedGraph(*graph, trg); +#endif + + return trg; +} + +/* + * Main entry point for evaluating a graph during index scanning. + * + * The check[] array is indexed by trigram number (in the array of simple + * trigrams returned by createTrgmNFA), and holds TRUE for those trigrams + * that are present in the index entry being checked. + */ +bool +trigramsMatchGraph(TrgmPackedGraph *graph, bool *check) +{ + int i, + j, + k, + queueIn, + queueOut; + + /* + * Reset temporary working areas. + */ + memset(graph->colorTrigramsActive, 0, + sizeof(bool) * graph->colorTrigramsCount); + memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount); + + /* + * Check which color trigrams were matched. A match for any simple + * trigram associated with a color trigram counts as a match of the color + * trigram. + */ + j = 0; + for (i = 0; i < graph->colorTrigramsCount; i++) + { + int cnt = graph->colorTrigramGroups[i]; + + for (k = j; k < j + cnt; k++) + { + if (check[k]) + { + /* + * Found one matched trigram in the group. Can skip the rest + * of them and go to the next group. + */ + graph->colorTrigramsActive[i] = true; + break; + } + } + j = j + cnt; + } + + /* + * Initialize the statesQueue to hold just the initial state. Note: + * statesQueue has room for statesCount entries, which is certainly enough + * since no state will be put in the queue more than once. The + * statesActive array marks which states have been queued. + */ + graph->statesActive[0] = true; + graph->statesQueue[0] = 0; + queueIn = 0; + queueOut = 1; + + /* Process queued states as long as there are any. */ + while (queueIn < queueOut) + { + int stateno = graph->statesQueue[queueIn++]; + TrgmPackedState *state = &graph->states[stateno]; + int cnt = state->arcsCount; + + /* Loop over state's out-arcs */ + for (i = 0; i < cnt; i++) + { + TrgmPackedArc *arc = &state->arcs[i]; + + /* + * If corresponding color trigram is present then activate the + * corresponding state. We're done if that's the final state, + * otherwise queue the state if it's not been queued already. + */ + if (graph->colorTrigramsActive[arc->colorTrgm]) + { + int nextstate = arc->targetState; + + if (nextstate == 1) + return true; /* success: final state is reachable */ + + if (!graph->statesActive[nextstate]) + { + graph->statesActive[nextstate] = true; + graph->statesQueue[queueOut++] = nextstate; + } + } + } + } + + /* Queue is empty, so match fails. */ + return false; +} + +/* + * Compile regex string into struct at *regex. + * NB: pg_regfree must be applied to regex if this completes successfully. + */ +static void +RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation) +{ + int text_re_len = VARSIZE_ANY_EXHDR(text_re); + char *text_re_val = VARDATA_ANY(text_re); + pg_wchar *pattern; + int pattern_len; + int regcomp_result; + char errMsg[100]; + + /* Convert pattern string to wide characters */ + pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); + pattern_len = pg_mb2wchar_with_len(text_re_val, + pattern, + text_re_len); + + /* Compile regex */ + regcomp_result = pg_regcomp(regex, + pattern, + pattern_len, + cflags, + collation); + + pfree(pattern); + + if (regcomp_result != REG_OKAY) + { + /* re didn't compile (no need for pg_regfree, if so) */ + pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("invalid regular expression: %s", errMsg))); + } +} + + +/*--------------------- + * Subroutines for pre-processing the color map (stage 1). + *--------------------- + */ + +/* + * Fill TrgmColorInfo structure for each color using regex export functions. + */ +static void +getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) +{ + int colorsCount = pg_reg_getnumcolors(regex); + int i; + + trgmNFA->ncolors = colorsCount; + trgmNFA->colorInfo = (TrgmColorInfo *) + palloc0(colorsCount * sizeof(TrgmColorInfo)); + + /* + * Loop over colors, filling TrgmColorInfo about each. + */ + for (i = 0; i < colorsCount; i++) + { + TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i]; + int charsCount = pg_reg_getnumcharacters(regex, i); + pg_wchar *chars; + int j; + + if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT) + { + /* Non expandable, or too large to work with */ + colorInfo->expandable = false; + continue; + } + + colorInfo->expandable = true; + colorInfo->containsNonWord = false; + colorInfo->wordChars = (trgm_mb_char *) + palloc(sizeof(trgm_mb_char) * charsCount); + colorInfo->wordCharsCount = 0; + + /* Extract all the chars in this color */ + chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount); + pg_reg_getcharacters(regex, i, chars, charsCount); + + /* + * Convert characters back to multibyte form, and save only those that + * are word characters. Set "containsNonWord" if any non-word + * character. (Note: it'd probably be nicer to keep the chars in + * pg_wchar format for now, but ISWORDCHR wants to see multibyte.) + */ + for (j = 0; j < charsCount; j++) + { + trgm_mb_char c; + + if (!convertPgWchar(chars[j], &c)) + continue; /* ok to ignore it altogether */ + if (ISWORDCHR(c.bytes)) + colorInfo->wordChars[colorInfo->wordCharsCount++] = c; + else + colorInfo->containsNonWord = true; + } + + pfree(chars); + } +} + +/* + * Convert pg_wchar to multibyte format. + * Returns false if the character should be ignored completely. + */ +static bool +convertPgWchar(pg_wchar c, trgm_mb_char *result) +{ + /* "s" has enough space for a multibyte character and a trailing NUL */ + char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + + /* + * We can ignore the NUL character, since it can never appear in a PG text + * string. This avoids the need for various special cases when + * reconstructing trigrams. + */ + if (c == 0) + return false; + + /* Do the conversion, making sure the result is NUL-terminated */ + memset(s, 0, sizeof(s)); + pg_wchar2mb_with_len(&c, s, 1); + + /* + * In IGNORECASE mode, we can ignore uppercase characters. We assume that + * the regex engine generated both uppercase and lowercase equivalents + * within each color, since we used the REG_ICASE option; so there's no + * need to process the uppercase version. + * + * XXX this code is dependent on the assumption that lowerstr() works the + * same as the regex engine's internal case folding machinery. Might be + * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c). + * On the other hand, the trigrams in the index were created using + * lowerstr(), so we're probably screwed if there's any incompatibility + * anyway. + */ +#ifdef IGNORECASE + { + char *lowerCased = lowerstr(s); + + if (strcmp(lowerCased, s) != 0) + { + pfree(lowerCased); + return false; + } + pfree(lowerCased); + } +#endif + + /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ + strncpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); + return true; +} + + +/*--------------------- + * Subroutines for expanding original NFA graph into a trigram graph (stage 2). + *--------------------- + */ + +/* + * Transform the graph, given a regex and extracted color information. + * + * We create and process a queue of expanded-graph states until all the states + * are processed. + * + * This algorithm may be stopped due to resource limitation. In this case we + * force every unprocessed branch to immediately finish with matching (this + * can give us false positives but no false negatives) by marking all + * unprocessed states as final. + */ +static void +transformGraph(TrgmNFA *trgmNFA) +{ + HASHCTL hashCtl; + TrgmStateKey initkey; + TrgmState *initstate; + + /* Initialize this stage's workspace in trgmNFA struct */ + trgmNFA->queue = NIL; + trgmNFA->keysQueue = NIL; + trgmNFA->arcsCount = 0; + trgmNFA->overflowed = false; + + /* Create hashtable for states */ + hashCtl.keysize = sizeof(TrgmStateKey); + hashCtl.entrysize = sizeof(TrgmState); + hashCtl.hcxt = CurrentMemoryContext; + hashCtl.hash = tag_hash; + trgmNFA->states = hash_create("Trigram NFA", + 1024, + &hashCtl, + HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION); + + /* Create initial state: ambiguous prefix, NFA's initial state */ + MemSet(&initkey, 0, sizeof(initkey)); + initkey.prefix.colors[0] = COLOR_UNKNOWN; + initkey.prefix.colors[1] = COLOR_UNKNOWN; + initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex); + + initstate = getState(trgmNFA, &initkey); + initstate->init = true; + trgmNFA->initState = initstate; + + /* + * Recursively build the expanded graph by processing queue of states + * (breadth-first search). getState already put initstate in the queue. + */ + while (trgmNFA->queue != NIL) + { + TrgmState *state = (TrgmState *) linitial(trgmNFA->queue); + + trgmNFA->queue = list_delete_first(trgmNFA->queue); + + /* + * If we overflowed then just mark state as final. Otherwise do + * actual processing. + */ + if (trgmNFA->overflowed) + state->fin = true; + else + processState(trgmNFA, state); + + /* Did we overflow? */ + if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS || + hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES) + trgmNFA->overflowed = true; + } +} + +/* + * Process one state: add enter keys and then add outgoing arcs. + */ +static void +processState(TrgmNFA *trgmNFA, TrgmState *state) +{ + /* keysQueue should be NIL already, but make sure */ + trgmNFA->keysQueue = NIL; + + /* + * Add state's own key, and then process all keys added to keysQueue until + * queue is empty. But we can quit if the state gets marked final. + */ + addKey(trgmNFA, state, &state->stateKey); + while (trgmNFA->keysQueue != NIL && !state->fin) + { + TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue); + + trgmNFA->keysQueue = list_delete_first(trgmNFA->keysQueue); + addKey(trgmNFA, state, key); + } + + /* + * Add outgoing arcs only if state isn't final (we have no interest in + * outgoing arcs if we already match) + */ + if (!state->fin) + addArcs(trgmNFA, state); +} + +/* + * Add the given enter key into the state's enterKeys list, and determine + * whether this should result in any further enter keys being added. + * If so, add those keys to keysQueue so that processState will handle them. + * + * If the enter key is for the NFA's final state, set state->fin = TRUE. + * This situation means that we can reach the final state from this expanded + * state without reading any predictable trigram, so we must consider this + * state as an accepting one. + * + * The given key could be a duplicate of one already in enterKeys, or be + * redundant with some enterKeys. So we check that before doing anything. + * + * Note that we don't generate any actual arcs here. addArcs will do that + * later, after we have identified all the enter keys for this state. + */ +static void +addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) +{ + regex_arc_t *arcs; + TrgmStateKey destKey; + ListCell *cell, + *prev, + *next; + int i, + arcsCount; + + /* + * Ensure any pad bytes in destKey are zero, since it may get used as a + * hashtable key by getState. + */ + MemSet(&destKey, 0, sizeof(destKey)); + + /* + * Compare key to each existing enter key of the state to check for + * redundancy. We can drop either old key(s) or the new key if we find + * redundancy. + */ + prev = NULL; + cell = list_head(state->enterKeys); + while (cell) + { + TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); + + next = lnext(cell); + if (existingKey->nstate == key->nstate) + { + if (prefixContains(&existingKey->prefix, &key->prefix)) + { + /* This old key already covers the new key. Nothing to do */ + return; + } + if (prefixContains(&key->prefix, &existingKey->prefix)) + { + /* + * The new key covers this old key. Remove the old key, it's + * no longer needed once we add this key to the list. + */ + state->enterKeys = list_delete_cell(state->enterKeys, + cell, prev); + } + else + prev = cell; + } + else + prev = cell; + cell = next; + } + + /* No redundancy, so add this key to the state's list */ + state->enterKeys = lappend(state->enterKeys, key); + + /* If state is now known final, mark it and we're done */ + if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex)) + { + state->fin = true; + return; + } + + /* + * Loop through all outgoing arcs of the corresponding state in the + * original NFA. + */ + arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); + arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); + + for (i = 0; i < arcsCount; i++) + { + regex_arc_t *arc = &arcs[i]; + + if (pg_reg_colorisbegin(trgmNFA->regex, arc->co)) + { + /* + * Start of line/string (^). Trigram extraction treats start of + * line same as start of word: double space prefix is added. + * Hence, make an enter key showing we can reach the arc + * destination with all-blank prefix. + */ + destKey.prefix.colors[0] = COLOR_BLANK; + destKey.prefix.colors[1] = COLOR_BLANK; + destKey.nstate = arc->to; + + /* Add enter key to this state */ + addKeyToQueue(trgmNFA, &destKey); + } + else if (pg_reg_colorisend(trgmNFA->regex, arc->co)) + { + /* + * End of line/string ($). We must consider this arc as a + * transition that doesn't read anything. The reason for adding + * this enter key to the state is that if the arc leads to the + * NFA's final state, we must mark this expanded state as final. + */ + destKey.prefix.colors[0] = COLOR_UNKNOWN; + destKey.prefix.colors[1] = COLOR_UNKNOWN; + destKey.nstate = arc->to; + + /* Add enter key to this state */ + addKeyToQueue(trgmNFA, &destKey); + } + else + { + /* Regular color */ + TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; + + if (colorInfo->expandable) + { + if (colorInfo->containsNonWord && + !validArcLabel(key, COLOR_BLANK)) + { + /* + * We can reach the arc destination after reading a + * non-word character, but the prefix is not something + * that addArc will accept with COLOR_BLANK, so no trigram + * arc can get made for this transition. We must make an + * enter key to show that the arc destination is + * reachable. Set it up with an all-blank prefix, since + * that corresponds to what the trigram extraction code + * will do at a word starting boundary. + */ + destKey.prefix.colors[0] = COLOR_BLANK; + destKey.prefix.colors[1] = COLOR_BLANK; + destKey.nstate = arc->to; + addKeyToQueue(trgmNFA, &destKey); + } + + if (colorInfo->wordCharsCount > 0 && + !validArcLabel(key, arc->co)) + { + /* + * We can reach the arc destination after reading a word + * character, but the prefix is not something that addArc + * will accept, so no trigram arc can get made for this + * transition. We must make an enter key to show that the + * arc destination is reachable. The prefix for the enter + * key should reflect the info we have for this arc. + */ + destKey.prefix.colors[0] = key->prefix.colors[1]; + destKey.prefix.colors[1] = arc->co; + destKey.nstate = arc->to; + addKeyToQueue(trgmNFA, &destKey); + } + } + else + { + /* + * Unexpandable color. Add enter key with ambiguous prefix, + * showing we can reach the destination from this state, but + * the preceding colors will be uncertain. (We do not set the + * first prefix color to key->prefix.colors[1], because a + * prefix of known followed by unknown is invalid.) + */ + destKey.prefix.colors[0] = COLOR_UNKNOWN; + destKey.prefix.colors[1] = COLOR_UNKNOWN; + destKey.nstate = arc->to; + addKeyToQueue(trgmNFA, &destKey); + } + } + } + + pfree(arcs); +} + +/* + * Add copy of given key to keysQueue for later processing. + */ +static void +addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key) +{ + TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey)); + + memcpy(keyCopy, key, sizeof(TrgmStateKey)); + trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy); +} + +/* + * Add outgoing arcs from given state, whose enter keys are all now known. + */ +static void +addArcs(TrgmNFA *trgmNFA, TrgmState *state) +{ + TrgmStateKey destKey; + ListCell *cell; + regex_arc_t *arcs; + int arcsCount, + i; + + /* + * Ensure any pad bytes in destKey are zero, since it may get used as a + * hashtable key by getState. + */ + MemSet(&destKey, 0, sizeof(destKey)); + + /* + * Iterate over enter keys associated with this expanded-graph state. This + * includes both the state's own stateKey, and any enter keys we added to + * it during addKey (which represent expanded-graph states that are not + * distinguishable from this one by means of trigrams). For each such + * enter key, examine all the out-arcs of the key's underlying NFA state, + * and try to make a trigram arc leading to where the out-arc leads. + * (addArc will deal with whether the arc is valid or not.) + */ + foreach(cell, state->enterKeys) + { + TrgmStateKey *key = (TrgmStateKey *) lfirst(cell); + + arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); + arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); + + for (i = 0; i < arcsCount; i++) + { + regex_arc_t *arc = &arcs[i]; + TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; + + /* + * Ignore non-expandable colors; addKey already handled the case. + * + * We need no special check for begin/end pseudocolors here. We + * don't need to do any processing for them, and they will be + * marked non-expandable since the regex engine will have reported + * them that way. + */ + if (!colorInfo->expandable) + continue; + + if (colorInfo->containsNonWord) + { + /* + * Color includes non-word character(s). + * + * Generate an arc, treating this transition as occurring on + * BLANK. This allows word-ending trigrams to be manufactured + * if possible. + */ + destKey.prefix.colors[0] = key->prefix.colors[1]; + destKey.prefix.colors[1] = COLOR_BLANK; + destKey.nstate = arc->to; + + addArc(trgmNFA, state, key, COLOR_BLANK, &destKey); + } + + if (colorInfo->wordCharsCount > 0) + { + /* + * Color includes word character(s). + * + * Generate an arc. Color is pushed into prefix of target + * state. + */ + destKey.prefix.colors[0] = key->prefix.colors[1]; + destKey.prefix.colors[1] = arc->co; + destKey.nstate = arc->to; + + addArc(trgmNFA, state, key, arc->co, &destKey); + } + } + + pfree(arcs); + } +} + +/* + * Generate an out-arc of the expanded graph, if it's valid and not redundant. + * + * state: expanded-graph state we want to add an out-arc to + * key: provides prefix colors (key->nstate is not used) + * co: transition color + * destKey: identifier for destination state of expanded graph + */ +static void +addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, + TrgmColor co, TrgmStateKey *destKey) +{ + TrgmArc *arc; + ListCell *cell; + + /* Do nothing if this wouldn't be a valid arc label trigram */ + if (!validArcLabel(key, co)) + return; + + /* + * Check if we are going to reach key which is covered by a key which is + * already listed in this state. If so arc is useless: the NFA can bypass + * it through a path that doesn't require any predictable trigram, so + * whether the arc's trigram is present or not doesn't really matter. + */ + foreach(cell, state->enterKeys) + { + TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); + + if (existingKey->nstate == destKey->nstate && + prefixContains(&existingKey->prefix, &destKey->prefix)) + return; + } + + /* Checks were successful, add new arc */ + arc = (TrgmArc *) palloc(sizeof(TrgmArc)); + arc->target = getState(trgmNFA, destKey); + arc->ctrgm.colors[0] = key->prefix.colors[0]; + arc->ctrgm.colors[1] = key->prefix.colors[1]; + arc->ctrgm.colors[2] = co; + + state->arcs = lappend(state->arcs, arc); + trgmNFA->arcsCount++; +} + +/* + * Can we make a valid trigram arc label from the given prefix and arc color? + * + * This is split out so that tests in addKey and addArc will stay in sync. + */ +static bool +validArcLabel(TrgmStateKey *key, TrgmColor co) +{ + /* + * We have to know full trigram in order to add outgoing arc. So we can't + * do it if prefix is ambiguous. + */ + if (key->prefix.colors[0] == COLOR_UNKNOWN) + return false; + + /* If key->prefix.colors[0] isn't unknown, its second color isn't either */ + Assert(key->prefix.colors[1] != COLOR_UNKNOWN); + /* And we should not be called with an unknown arc color anytime */ + Assert(co != COLOR_UNKNOWN); + + /* + * We don't bother with making arcs representing three non-word + * characters, since that's useless for trigram extraction. + */ + if (key->prefix.colors[0] == COLOR_BLANK && + key->prefix.colors[1] == COLOR_BLANK && + co == COLOR_BLANK) + return false; + + /* + * We also reject nonblank-blank-anything. The nonblank-blank-nonblank + * case doesn't correspond to any trigram the trigram extraction code + * would make. The nonblank-blank-blank case is also not possible with + * RPADDING = 1. (Note that in many cases we'd fail to generate such a + * trigram even if it were valid, for example processing "foo bar" will + * not result in considering the trigram "o ". So if you want to support + * RPADDING = 2, there's more to do than just twiddle this test.) + */ + if (key->prefix.colors[0] != COLOR_BLANK && + key->prefix.colors[1] == COLOR_BLANK) + return false; + + /* + * Other combinations involving blank are valid, in particular we assume + * blank-blank-nonblank is valid, which presumes that LPADDING is 2. + * + * Note: Using again the example "foo bar", we will not consider the + * trigram " b", though this trigram would be found by the trigram + * extraction code. Since we will find " ba", it doesn't seem worth + * trying to hack the algorithm to generate the additional trigram. + */ + + /* arc label is valid */ + return true; +} + +/* + * Get state of expanded graph for given state key, + * and queue the state for processing if it didn't already exist. + */ +static TrgmState * +getState(TrgmNFA *trgmNFA, TrgmStateKey *key) +{ + TrgmState *state; + bool found; + + state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER, + &found); + if (!found) + { + /* New state: initialize and queue it */ + state->arcs = NIL; + state->enterKeys = NIL; + state->init = false; + state->fin = false; + state->parent = NULL; + state->children = NIL; + state->number = -1; + + trgmNFA->queue = lappend(trgmNFA->queue, state); + } + return state; +} + +/* + * Check if prefix1 "contains" prefix2. + * + * "contains" means that any exact prefix (with no ambiguity) that satisfies + * prefix2 also satisfies prefix1. + */ +static bool +prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2) +{ + if (prefix1->colors[1] == COLOR_UNKNOWN) + { + /* Fully ambiguous prefix contains everything */ + return true; + } + else if (prefix1->colors[0] == COLOR_UNKNOWN) + { + /* + * Prefix with only first unknown color contains every prefix with + * same second color. + */ + if (prefix1->colors[1] == prefix2->colors[1]) + return true; + else + return false; + } + else + { + /* Exact prefix contains only the exact same prefix */ + if (prefix1->colors[0] == prefix2->colors[0] && + prefix1->colors[1] == prefix2->colors[1]) + return true; + else + return false; + } +} + + +/*--------------------- + * Subroutines for expanding color trigrams into regular trigrams (stage 3). + *--------------------- + */ + +/* + * Get vector of all color trigrams in graph and select which of them + * to expand into simple trigrams. + * + * Returns TRUE if OK, FALSE if exhausted resource limits. + */ +static bool +selectColorTrigrams(TrgmNFA *trgmNFA) +{ + HASH_SEQ_STATUS scan_status; + int arcsCount = trgmNFA->arcsCount, + i; + TrgmState *state; + ColorTrgmInfo *colorTrgms; + int64 totalTrgmCount; + float4 totalTrgmPenalty; + int number; + + /* Collect color trigrams from all arcs */ + colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount); + trgmNFA->colorTrgms = colorTrgms; + + i = 0; + hash_seq_init(&scan_status, trgmNFA->states); + while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) + { + ListCell *cell; + + foreach(cell, state->arcs) + { + TrgmArc *arc = (TrgmArc *) lfirst(cell); + TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); + + arcInfo->source = state; + arcInfo->target = arc->target; + colorTrgms[i].arcs = list_make1(arcInfo); + colorTrgms[i].expanded = true; + colorTrgms[i].ctrgm = arc->ctrgm; + i++; + } + } + Assert(i == arcsCount); + + /* Remove duplicates, merging their arcs lists */ + if (arcsCount >= 2) + { + ColorTrgmInfo *p1, + *p2; + + /* Sort trigrams to ease duplicate detection */ + qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); + + /* p1 is probe point, p2 is last known non-duplicate. */ + p2 = colorTrgms; + for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++) + { + if (colorTrgmInfoCmp(p1, p2) > 0) + { + p2++; + *p2 = *p1; + } + else + { + p2->arcs = list_concat(p2->arcs, p1->arcs); + } + } + trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1; + } + else + { + trgmNFA->colorTrgmsCount = arcsCount; + } + + /* + * Count number of simple trigrams generated by each color trigram, and + * also compute a penalty value, which is the number of simple trigrams + * times a multiplier that depends on its whitespace content. + * + * Note: per-color-trigram counts cannot overflow an int so long as + * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about + * 1290. However, the grand total totalTrgmCount might conceivably + * overflow an int, so we use int64 for that within this routine. Also, + * penalties are calculated in float4 arithmetic to avoid any overflow + * worries. + */ + totalTrgmCount = 0; + totalTrgmPenalty = 0.0f; + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + ColorTrgmInfo *trgmInfo = &colorTrgms[i]; + int j, + count = 1, + typeIndex = 0; + + for (j = 0; j < 3; j++) + { + TrgmColor c = trgmInfo->ctrgm.colors[j]; + + typeIndex *= 2; + if (c == COLOR_BLANK) + typeIndex++; + else + count *= trgmNFA->colorInfo[c].wordCharsCount; + } + trgmInfo->count = count; + totalTrgmCount += count; + trgmInfo->penalty = penalties[typeIndex] * (float4) count; + totalTrgmPenalty += trgmInfo->penalty; + } + + /* Sort color trigrams in descending order of their penalties */ + qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), + colorTrgmInfoPenaltyCmp); + + /* + * Remove color trigrams from the graph so long as total penalty of color + * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to + * WISH_TRGM_PENALTY, it's OK so long as total count is no more than + * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher + * penalty, since those are the most promising for reducing the total + * penalty. When removing a color trigram we have to merge states + * connected by arcs labeled with that trigram. It's necessary to not + * merge initial and final states, because our graph becomes useless if + * that happens; so we cannot always remove the trigram we'd prefer to. + */ + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + ColorTrgmInfo *trgmInfo = &colorTrgms[i]; + bool canRemove = true; + ListCell *cell; + + /* Done if we've reached the target */ + if (totalTrgmPenalty <= WISH_TRGM_PENALTY) + break; + + /* + * Does any arc of this color trigram connect initial and final + * states? If so we can't remove it. + */ + foreach(cell, trgmInfo->arcs) + { + TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); + TrgmState *source = arcInfo->source, + *target = arcInfo->target; + + /* examine parent states, if any merging has already happened */ + while (source->parent) + source = source->parent; + while (target->parent) + target = target->parent; + + if ((source->init || target->init) && + (source->fin || target->fin)) + { + canRemove = false; + break; + } + } + if (!canRemove) + continue; + + /* OK, merge states linked by each arc labeled by the trigram */ + foreach(cell, trgmInfo->arcs) + { + TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); + TrgmState *source = arcInfo->source, + *target = arcInfo->target; + + while (source->parent) + source = source->parent; + while (target->parent) + target = target->parent; + if (source != target) + mergeStates(source, target); + } + + /* Mark trigram unexpanded, and update totals */ + trgmInfo->expanded = false; + totalTrgmCount -= trgmInfo->count; + totalTrgmPenalty -= trgmInfo->penalty; + } + + /* Did we succeed in fitting into MAX_TRGM_COUNT? */ + if (totalTrgmCount > MAX_TRGM_COUNT) + return false; + + trgmNFA->totalTrgmCount = (int) totalTrgmCount; + + /* + * Sort color trigrams by colors (will be useful for bsearch in packGraph) + * and enumerate the color trigrams that are expanded. + */ + number = 0; + qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), + colorTrgmInfoCmp); + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + if (colorTrgms[i].expanded) + { + colorTrgms[i].number = number; + number++; + } + } + + return true; +} + +/* + * Expand selected color trigrams into regular trigrams. + * + * Returns the TRGM array to be passed to the index machinery. + * The array must be allocated in rcontext. + */ +static TRGM * +expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext) +{ + TRGM *trg; + trgm *p; + int i; + TrgmColorInfo blankColor; + trgm_mb_char blankChar; + + /* Set up "blank" color structure containing a single zero character */ + memset(blankChar.bytes, 0, sizeof(blankChar.bytes)); + blankColor.wordCharsCount = 1; + blankColor.wordChars = &blankChar; + + /* Construct the trgm array */ + trg = (TRGM *) + MemoryContextAllocZero(rcontext, + TRGMHDRSIZE + + trgmNFA->totalTrgmCount * sizeof(trgm)); + trg->flag = ARRKEY; + SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount)); + p = GETARR(trg); + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i]; + TrgmColorInfo *c[3]; + trgm_mb_char s[3]; + int j, + i1, + i2, + i3; + + /* Ignore any unexpanded trigrams ... */ + if (!colorTrgm->expanded) + continue; + + /* Get colors, substituting the dummy struct for COLOR_BLANK */ + for (j = 0; j < 3; j++) + { + if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK) + c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]]; + else + c[j] = &blankColor; + } + + /* Iterate over all possible combinations of colors' characters */ + for (i1 = 0; i1 < c[0]->wordCharsCount; i1++) + { + s[0] = c[0]->wordChars[i1]; + for (i2 = 0; i2 < c[1]->wordCharsCount; i2++) + { + s[1] = c[1]->wordChars[i2]; + for (i3 = 0; i3 < c[2]->wordCharsCount; i3++) + { + s[2] = c[2]->wordChars[i3]; + fillTrgm(p, s); + p++; + } + } + } + } + + return trg; +} + +/* + * Convert trigram into trgm datatype. + */ +static void +fillTrgm(trgm *ptrgm, trgm_mb_char s[3]) +{ + char str[3 * MAX_MULTIBYTE_CHAR_LEN], + *p; + int i, + j; + + /* Write multibyte string into "str" (we don't need null termination) */ + p = str; + + for (i = 0; i < 3; i++) + { + if (s[i].bytes[0] != 0) + { + for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++) + *p++ = s[i].bytes[j]; + } + else + { + /* Emit a space in place of COLOR_BLANK */ + *p++ = ' '; + } + } + + /* Convert "str" to a standard trigram (possibly hashing it) */ + compact_trigram(ptrgm, str, p - str); +} + +/* + * Merge two states of graph. + */ +static void +mergeStates(TrgmState *state1, TrgmState *state2) +{ + ListCell *cell; + + Assert(state1 != state2); + Assert(!state1->parent); + Assert(!state2->parent); + + /* state1 absorbs state2's init/fin flags */ + state1->init |= state2->init; + state1->fin |= state2->fin; + + /* state2, and all its children, become children of state1 */ + foreach(cell, state2->children) + { + TrgmState *state = (TrgmState *) lfirst(cell); + + state->parent = state1; + } + state2->parent = state1; + state1->children = list_concat(state1->children, state2->children); + state1->children = lappend(state1->children, state2); + state2->children = NIL; +} + +/* + * Compare function for sorting of color trigrams by their colors. + */ +static int +colorTrgmInfoCmp(const void *p1, const void *p2) +{ + const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; + const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; + + return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm)); +} + +/* + * Compare function for sorting color trigrams in descending order of + * their penalty fields. + */ +static int +colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) +{ + float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty; + float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty; + + if (penalty1 < penalty2) + return 1; + else if (penalty1 == penalty2) + return 0; + else + return -1; +} + + +/*--------------------- + * Subroutines for packing the graph into final representation (stage 4). + *--------------------- + */ + +/* + * Pack expanded graph into final representation. + * + * The result data must be allocated in rcontext. + */ +static TrgmPackedGraph * +packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) +{ + int number = 2, + arcIndex, + arcsCount; + HASH_SEQ_STATUS scan_status; + TrgmState *state; + TrgmPackArcInfo *arcs, + *p1, + *p2; + TrgmPackedArc *packedArcs; + TrgmPackedGraph *result; + int i, + j; + + /* Enumerate surviving states, giving init and fin reserved numbers */ + hash_seq_init(&scan_status, trgmNFA->states); + while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) + { + while (state->parent) + state = state->parent; + + if (state->number < 0) + { + if (state->init) + state->number = 0; + else if (state->fin) + state->number = 1; + else + { + state->number = number; + number++; + } + } + } + + /* Collect array of all arcs */ + arcs = (TrgmPackArcInfo *) + palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount); + arcIndex = 0; + hash_seq_init(&scan_status, trgmNFA->states); + while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) + { + TrgmState *source = state; + ListCell *cell; + + while (source->parent) + source = source->parent; + + foreach(cell, state->arcs) + { + TrgmArc *arc = (TrgmArc *) lfirst(cell); + TrgmState *target = arc->target; + + while (target->parent) + target = target->parent; + + if (source->number != target->number) + { + ColorTrgmInfo *ctrgm; + + ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm, + trgmNFA->colorTrgms, + trgmNFA->colorTrgmsCount, + sizeof(ColorTrgmInfo), + colorTrgmInfoCmp); + Assert(ctrgm != NULL); + Assert(ctrgm->expanded); + + arcs[arcIndex].sourceState = source->number; + arcs[arcIndex].targetState = target->number; + arcs[arcIndex].colorTrgm = ctrgm->number; + arcIndex++; + } + } + } + + /* Sort arcs to ease duplicate detection */ + qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp); + + /* We could have duplicates because states were merged. Remove them. */ + /* p1 is probe point, p2 is last known non-duplicate. */ + p2 = arcs; + for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++) + { + if (packArcInfoCmp(p1, p2) > 0) + { + p2++; + *p2 = *p1; + } + } + arcsCount = (p2 - arcs) + 1; + + /* Create packed representation */ + result = (TrgmPackedGraph *) + MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph)); + + /* Pack color trigrams information */ + result->colorTrigramsCount = 0; + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + if (trgmNFA->colorTrgms[i].expanded) + result->colorTrigramsCount++; + } + result->colorTrigramGroups = (int *) + MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount); + j = 0; + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) + { + if (trgmNFA->colorTrgms[i].expanded) + { + result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count; + j++; + } + } + + /* Pack states and arcs information */ + result->statesCount = number; + result->states = (TrgmPackedState *) + MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState)); + packedArcs = (TrgmPackedArc *) + MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc)); + j = 0; + for (i = 0; i < number; i++) + { + int cnt = 0; + + result->states[i].arcs = &packedArcs[j]; + while (j < arcsCount && arcs[j].sourceState == i) + { + packedArcs[j].targetState = arcs[j].targetState; + packedArcs[j].colorTrgm = arcs[j].colorTrgm; + cnt++; + j++; + } + result->states[i].arcsCount = cnt; + } + + /* Allocate working memory for trigramsMatchGraph() */ + result->colorTrigramsActive = (bool *) + MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount); + result->statesActive = (bool *) + MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount); + result->statesQueue = (int *) + MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount); + + return result; +} + +/* + * Comparison function for sorting TrgmPackArcInfos. + * + * Compares arcs in following order: sourceState, colorTrgm, targetState. + */ +static int +packArcInfoCmp(const void *a1, const void *a2) +{ + const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1; + const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2; + + if (p1->sourceState < p2->sourceState) + return -1; + if (p1->sourceState > p2->sourceState) + return 1; + if (p1->colorTrgm < p2->colorTrgm) + return -1; + if (p1->colorTrgm > p2->colorTrgm) + return 1; + if (p1->targetState < p2->targetState) + return -1; + if (p1->targetState > p2->targetState) + return 1; + return 0; +} + + +/*--------------------- + * Debugging functions + * + * These are designed to emit GraphViz files. + *--------------------- + */ + +#ifdef TRGM_REGEXP_DEBUG + +/* + * Print initial NFA, in regexp library's representation + */ +static void +printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors) +{ + StringInfoData buf; + int nstates = pg_reg_getnumstates(regex); + int state; + int i; + + initStringInfo(&buf); + + appendStringInfoString(&buf, "\ndigraph sourceNFA {\n"); + + for (state = 0; state < nstates; state++) + { + regex_arc_t *arcs; + int i, + arcsCount; + + appendStringInfo(&buf, "s%d", state); + if (pg_reg_getfinalstate(regex) == state) + appendStringInfoString(&buf, " [shape = doublecircle]"); + appendStringInfoString(&buf, ";\n"); + + arcsCount = pg_reg_getnumoutarcs(regex, state); + arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); + pg_reg_getoutarcs(regex, state, arcs, arcsCount); + + for (i = 0; i < arcsCount; i++) + { + appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n", + state, arcs[i].to, arcs[i].co); + } + + pfree(arcs); + } + + appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); + appendStringInfo(&buf, " initial -> s%d;\n", + pg_reg_getinitialstate(regex)); + + /* Print colors */ + appendStringInfoString(&buf, " { rank = sink;\n"); + appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n"); + + for (i = 0; i < ncolors; i++) + { + TrgmColorInfo *color = &colors[i]; + int j; + + appendStringInfo(&buf, "<br/>Color %d: ", i); + if (color->expandable) + { + for (j = 0; j < color->wordCharsCount; j++) + { + char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + + memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN); + s[MAX_MULTIBYTE_CHAR_LEN] = '\0'; + appendStringInfoString(&buf, s); + } + } + else + appendStringInfoString(&buf, "not expandable"); + appendStringInfoChar(&buf, '\n'); + } + + appendStringInfoString(&buf, " >];\n"); + appendStringInfoString(&buf, " }\n"); + appendStringInfoString(&buf, "}\n"); + + { + /* dot -Tpng -o /tmp/source.png < /tmp/source.dot */ + FILE *fp = fopen("/tmp/source.dot", "w"); + + fprintf(fp, "%s", buf.data); + fclose(fp); + } + + pfree(buf.data); +} + +/* + * Print expanded graph. + */ +static void +printTrgmNFA(TrgmNFA *trgmNFA) +{ + StringInfoData buf; + HASH_SEQ_STATUS scan_status; + TrgmState *state; + TrgmState *initstate = NULL; + + initStringInfo(&buf); + + appendStringInfoString(&buf, "\ndigraph transformedNFA {\n"); + + hash_seq_init(&scan_status, trgmNFA->states); + while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) + { + ListCell *cell; + + appendStringInfo(&buf, "s%p", (void *) state); + if (state->fin) + appendStringInfoString(&buf, " [shape = doublecircle]"); + if (state->init) + initstate = state; + appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate); + appendStringInfoString(&buf, ";\n"); + + foreach(cell, state->arcs) + { + TrgmArc *arc = (TrgmArc *) lfirst(cell); + + appendStringInfo(&buf, " s%p -> s%p [label = \"", + (void *) state, (void *) arc->target); + printTrgmColor(&buf, arc->ctrgm.colors[0]); + appendStringInfoChar(&buf, ' '); + printTrgmColor(&buf, arc->ctrgm.colors[1]); + appendStringInfoChar(&buf, ' '); + printTrgmColor(&buf, arc->ctrgm.colors[2]); + appendStringInfoString(&buf, "\"];\n"); + } + } + + if (initstate) + { + appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); + appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate); + } + + appendStringInfoString(&buf, "}\n"); + + { + /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.dot */ + FILE *fp = fopen("/tmp/transformed.dot", "w"); + + fprintf(fp, "%s", buf.data); + fclose(fp); + } + + pfree(buf.data); +} + +/* + * Print a TrgmColor readably. + */ +static void +printTrgmColor(StringInfo buf, TrgmColor co) +{ + if (co == COLOR_UNKNOWN) + appendStringInfoChar(buf, 'u'); + else if (co == COLOR_BLANK) + appendStringInfoChar(buf, 'b'); + else + appendStringInfo(buf, "%d", (int) co); +} + +/* + * Print final packed representation of trigram-based expanded graph. + */ +static void +printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams) +{ + StringInfoData buf; + trgm *p; + int i; + + initStringInfo(&buf); + + appendStringInfoString(&buf, "\ndigraph packedGraph {\n"); + + for (i = 0; i < packedGraph->statesCount; i++) + { + TrgmPackedState *state = &packedGraph->states[i]; + int j; + + appendStringInfo(&buf, " s%d", i); + if (i == 1) + appendStringInfoString(&buf, " [shape = doublecircle]"); + + appendStringInfo(&buf, " [label = <s%d>];\n", i); + + for (j = 0; j < state->arcsCount; j++) + { + TrgmPackedArc *arc = &state->arcs[j]; + + appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n", + i, arc->targetState, arc->colorTrgm); + } + } + + appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); + appendStringInfo(&buf, " initial -> s%d;\n", 0); + + /* Print trigrams */ + appendStringInfoString(&buf, " { rank = sink;\n"); + appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n"); + + p = GETARR(trigrams); + for (i = 0; i < packedGraph->colorTrigramsCount; i++) + { + int count = packedGraph->colorTrigramGroups[i]; + int j; + + appendStringInfo(&buf, "<br/>Trigram %d: ", i); + + for (j = 0; j < count; j++) + { + if (j > 0) + appendStringInfoString(&buf, ", "); + + /* + * XXX This representation is nice only for all-ASCII trigrams. + */ + appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]); + p++; + } + } + + appendStringInfoString(&buf, " >];\n"); + appendStringInfoString(&buf, " }\n"); + appendStringInfoString(&buf, "}\n"); + + { + /* dot -Tpng -o /tmp/packed.png < /tmp/packed.dot */ + FILE *fp = fopen("/tmp/packed.dot", "w"); + + fprintf(fp, "%s", buf.data); + fclose(fp); + } + + pfree(buf.data); +} + +#endif /* TRGM_REGEXP_DEBUG */ |