diff options
| author | Tom Lane | 2007-10-15 21:36:50 +0000 |
|---|---|---|
| committer | Tom Lane | 2007-10-15 21:36:50 +0000 |
| commit | 5fcb079858bb392e87067b5526e9df950db38024 (patch) | |
| tree | 4ffb764af092be94fbe0e033dce2f492e6c937f7 /contrib/dict_xsyn | |
| parent | fb631dba2a3c2c183bb99f2098491ecf96fb6664 (diff) | |
Add sample text search dictionary templates and parsers, to replace the
hard-to-maintain textual examples currently in the SGML docs. From
Sergey Karpov.
Diffstat (limited to 'contrib/dict_xsyn')
| -rw-r--r-- | contrib/dict_xsyn/Makefile | 38 | ||||
| -rw-r--r-- | contrib/dict_xsyn/README.dict_xsyn | 52 | ||||
| -rw-r--r-- | contrib/dict_xsyn/dict_xsyn.c | 235 | ||||
| -rw-r--r-- | contrib/dict_xsyn/dict_xsyn.sql.in | 29 | ||||
| -rw-r--r-- | contrib/dict_xsyn/expected/dict_xsyn.out | 22 | ||||
| -rw-r--r-- | contrib/dict_xsyn/sql/dict_xsyn.sql | 16 | ||||
| -rw-r--r-- | contrib/dict_xsyn/uninstall_dict_xsyn.sql | 9 | ||||
| -rw-r--r-- | contrib/dict_xsyn/xsyn_sample.rules | 6 |
8 files changed, 407 insertions, 0 deletions
diff --git a/contrib/dict_xsyn/Makefile b/contrib/dict_xsyn/Makefile
new file mode 100644
index 00000000000..563f039e468
--- /dev/null
+++ b/contrib/dict_xsyn/Makefile
@@ -0,0 +1,38 @@
+# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+MODULE_big = dict_xsyn
+OBJS = dict_xsyn.o
+DATA_built = dict_xsyn.sql
+DATA = uninstall_dict_xsyn.sql
+DOCS = README.dict_xsyn
+REGRESS = dict_xsyn
+
+DICTDIR = tsearch_data
+DICTFILES = xsyn_sample.rules
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/dict_xsyn
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+install: install-data
+
+.PHONY: install-data
+install-data: $(DICTFILES)
+	for i in $(DICTFILES); \
+	  do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
+	done
+
+uninstall: uninstall-data
+
+.PHONY: uninstall-data
+uninstall-data:
+	for i in $(DICTFILES); \
+	  do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
+	done
diff --git a/contrib/dict_xsyn/README.dict_xsyn b/contrib/dict_xsyn/README.dict_xsyn
new file mode 100644
index 00000000000..9565eefefbc
--- /dev/null
+++ b/contrib/dict_xsyn/README.dict_xsyn
@@ -0,0 +1,52 @@
+Extended Synonym dictionary
+===========================
+
+This is a simple synonym dictionary. It replaces words with groups of their
+synonyms, and so makes it possible to search for a word using any of its
+synonyms.
+
+* Configuration
+
+It accepts the following options:
+
+ - KEEPORIG controls whether the original word is included, or only its
+   synonyms. Default is 'true'.
+
+ - RULES is the base name of the file containing the list of synonyms.
+   This file must be in $(prefix)/share/tsearch_data/, and its name must
+   end in ".rules" (which is not included in the RULES parameter).
+
+The rules file has the following format:
+
+ - Each line represents a group of synonyms for a single word, which is
+   given first on the line. Synonyms are separated by whitespace:
+
+	word syn1 syn2 syn3
+
+ - Sharp ('#') sign is a comment delimiter. It may appear at any position
+   inside the line. The rest of the line will be skipped.
+
+Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/,
+for an example.
+
+* Usage
+
+1. Compile and install
+
+2. Load dictionary
+
+   psql mydb < dict_xsyn.sql
+
+3. Test it
+
+   mydb=# SELECT ts_lexize('xsyn','word');
+         ts_lexize
+   -----------------------
+   {word,syn1,syn2,syn3}
+
+4. Change the dictionary options as you wish
+
+   mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
+   ALTER TEXT SEARCH DICTIONARY
+
+That's all.
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
new file mode 100644
index 00000000000..1cd53a26bd1
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -0,0 +1,276 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_xsyn.c
+ *	  Extended synonym dictionary
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+PG_MODULE_MAGIC;
+
+typedef struct
+{
+	char *key; /* Word */
+	char *value; /* Unparsed list of synonyms, including the word itself */
+} Syn;
+
+typedef struct
+{
+	int len;
+	Syn *syn;
+
+	bool keeporig;
+} DictSyn;
+
+
+PG_FUNCTION_INFO_V1(dxsyn_init);
+Datum dxsyn_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dxsyn_lexize);
+Datum dxsyn_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * find_word
+ *		Locate the next whitespace-delimited word in a string.
+ *
+ * Skips leading whitespace starting at "in".  Returns a pointer to the first
+ * character of the word and sets *end to the first character after it.
+ * Returns NULL, with *end set to NULL, when the rest of the string is empty
+ * or the next word begins with '#' (the comment delimiter).  Note that '#'
+ * is only recognized at the start of a word, not inside one.
+ */
+static char *
+find_word(char *in, char **end)
+{
+	char *start;
+
+	*end = NULL;
+	while (*in && t_isspace(in))
+		in += pg_mblen(in);
+
+	if (!*in || *in == '#')
+		return NULL;
+	start = in;
+
+	while (*in && !t_isspace(in))
+		in += pg_mblen(in);
+
+	*end = in;
+
+	return start;
+}
+
+/*
+ * compare_syn
+ *		qsort/bsearch comparator: orders Syn entries by key using strcmp().
+ */
+static int
+compare_syn(const void *a, const void *b)
+{
+	return strcmp(((const Syn *) a)->key, ((const Syn *) b)->key);
+}
+
+/*
+ * read_dictionary
+ *		Load the rules file named by "filename" into d->syn.
+ *
+ * Each line that contains at least one word before any comment becomes one
+ * entry: the first word is the key and the whole lower-cased line is kept as
+ * the unparsed value.  Entries are sorted by key so that dxsyn_lexize() can
+ * use bsearch().  Raises an ERROR if the file cannot be opened.
+ */
+static void
+read_dictionary(DictSyn *d, char *filename)
+{
+	char *real_filename = get_tsearch_config_filename(filename, "rules");
+	FILE *fin;
+	char *line;
+	int cur = 0;
+
+	if ((fin = AllocateFile(real_filename, "r")) == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open synonym file \"%s\": %m",
+						real_filename)));
+
+	while ((line = t_readline(fin)) != NULL)
+	{
+		char *value;
+		char *key;
+		char *end = NULL;
+
+		if (*line == '\0')
+		{
+			pfree(line);		/* don't leak skipped empty lines */
+			continue;
+		}
+
+		value = lowerstr(line);
+		pfree(line);
+
+		key = find_word(value, &end);
+		if (!key)
+		{
+			pfree(value);
+			continue;
+		}
+
+		/* d->len doubles as the allocated capacity while loading */
+		if (cur == d->len)
+		{
+			d->len = (d->len > 0) ? 2 * d->len : 16;
+			if (d->syn)
+				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+			else
+				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+		}
+
+		d->syn[cur].key = pnstrdup(key, end - key);
+		d->syn[cur].value = value;
+
+		cur++;
+	}
+
+	FreeFile(fin);
+
+	d->len = cur;
+	if (cur > 1)
+		qsort(d->syn, d->len, sizeof(Syn), compare_syn);
+
+	pfree(real_filename);
+}
+
+/*
+ * dxsyn_init
+ *		Dictionary initialization: parse options and build a DictSyn.
+ *
+ * Recognized options (case-insensitive) are KEEPORIG (boolean, default true)
+ * and RULES (base name of the rules file).  Any other option raises an ERROR.
+ */
+Datum
+dxsyn_init(PG_FUNCTION_ARGS)
+{
+	List *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictSyn *d;
+	ListCell *l;
+
+	d = (DictSyn *) palloc0(sizeof(DictSyn));
+	d->len = 0;
+	d->syn = NULL;
+	d->keeporig = true;
+
+	foreach(l, dictoptions)
+	{
+		DefElem *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
+		{
+			d->keeporig = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel->defname, "RULES") == 0)
+		{
+			read_dictionary(d, defGetString(defel));
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized xsyn parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * dxsyn_lexize
+ *		Look up a word and return its synonyms as a TSLexeme array.
+ *
+ * Arguments are the DictSyn built by dxsyn_init(), the input word and its
+ * length.  Returns NULL if the word is empty, the dictionary has no entries,
+ * or the lower-cased word has no matching key; otherwise returns a palloc'd
+ * array terminated by an entry whose lexeme is NULL.  The first word of the
+ * rule (the word itself) is included only when KEEPORIG is set.
+ */
+Datum
+dxsyn_lexize(PG_FUNCTION_ARGS)
+{
+	DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
+	char *in = (char *) PG_GETARG_POINTER(1);
+	int length = PG_GETARG_INT32(2);
+	Syn word;
+	Syn *found;
+	TSLexeme *res = NULL;
+
+	if (!length || d->len == 0)
+		PG_RETURN_POINTER(NULL);
+
+	/* Create search pattern */
+	{
+		char *temp = pnstrdup(in, length);
+
+		word.key = lowerstr(temp);
+		pfree(temp);
+		word.value = NULL;
+	}
+
+	/* Look for matching syn */
+	found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
+	pfree(word.key);
+
+	if (!found)
+		PG_RETURN_POINTER(NULL);
+
+	/* Parse string of synonyms and return array of words */
+	{
+		char *pos = found->value;
+		char *syn;
+		char *end;
+		int nsyns = 0;
+		bool is_first = true;
+
+		/* Start with room for the terminator; grow as lexemes are added */
+		res = (TSLexeme *) palloc(sizeof(TSLexeme));
+
+		while ((syn = find_word(pos, &end)) != NULL)
+		{
+			/* first word is added to result only if KEEPORIG flag is set */
+			if (d->keeporig || !is_first)
+			{
+				res = (TSLexeme *) repalloc(res,
+											sizeof(TSLexeme) * (nsyns + 2));
+
+				/* palloc'd memory is not zeroed, so set every field */
+				res[nsyns].lexeme = pnstrdup(syn, end - syn);
+				res[nsyns].nvariant = 0;
+				res[nsyns].flags = 0;
+				nsyns++;
+			}
+
+			is_first = false;
+			pos = end;
+		}
+
+		/* Terminate the array with a fully initialized entry */
+		res[nsyns].lexeme = NULL;
+		res[nsyns].nvariant = 0;
+		res[nsyns].flags = 0;
+	}
+
+	PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_xsyn/dict_xsyn.sql.in b/contrib/dict_xsyn/dict_xsyn.sql.in
new file mode 100644
index 00000000000..0e5755e5b17
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.sql.in
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION dxsyn_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE xsyn_template (
+        LEXIZE = dxsyn_lexize,
+        INIT = dxsyn_init
+);
+
+CREATE TEXT SEARCH DICTIONARY xsyn (
+        TEMPLATE = xsyn_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
+
+END;
diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out
new file mode 100644
index 00000000000..99071ea8c74
--- /dev/null
+++ b/contrib/dict_xsyn/expected/dict_xsyn.out
@@ -0,0 +1,22 @@
+--
+-- first, define the datatype. Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize    
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql
new file mode 100644
index 00000000000..17f6df9cf3d
--- /dev/null
+++ b/contrib/dict_xsyn/sql/dict_xsyn.sql
@@ -0,0 +1,16 @@
+--
+-- first, define the datatype. Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i dict_xsyn.sql
+\set ECHO all
+RESET client_min_messages;
+
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'grb');
diff --git a/contrib/dict_xsyn/uninstall_dict_xsyn.sql b/contrib/dict_xsyn/uninstall_dict_xsyn.sql
new file mode 100644
index 00000000000..7b7acea0d14
--- /dev/null
+++ b/contrib/dict_xsyn/uninstall_dict_xsyn.sql
@@ -0,0 +1,9 @@
+SET search_path = public;
+
+DROP TEXT SEARCH DICTIONARY xsyn;
+
+DROP TEXT SEARCH TEMPLATE xsyn_template;
+
+DROP FUNCTION dxsyn_init(internal);
+
+DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal);
diff --git a/contrib/dict_xsyn/xsyn_sample.rules b/contrib/dict_xsyn/xsyn_sample.rules
new file mode 100644
index 00000000000..203bec793a1
--- /dev/null
+++ b/contrib/dict_xsyn/xsyn_sample.rules
@@ -0,0 +1,6 @@
+# Sample rules file for eXtended Synonym (xsyn) dictionary
+# format is as follows:
+#
+# word synonym1 synonym2 ...
+#
+supernova sn sne 1987a
