Unaccent dictionary.

author Teodor Sigaev <teodor@sigaev.ru>

Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
diff --git a/contrib/Makefile b/contrib/Makefile

index 85cabd8618aab973ca400fea66229b40f52daa22..8543b5287fe57c4bf267706eff389d0ef608d304 100644 (file)
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $
  
  subdir = contrib
  top_builddir = ..
@@ -39,6 +39,7 @@ SUBDIRS = \
         tablefunc   \
         test_parser \
         tsearch2    \
+       unaccent    \
         vacuumlo
  
  ifeq ($(with_openssl),yes)
diff --git a/contrib/README b/contrib/README

index 1ae49adc704530d3f07e734116ef273efb08618a..a8396a5bfadf513ab5133da0d70a27a71ca5f961 100644 (file)
--- a/contrib/README
+++ b/contrib/README
@@ -169,6 +169,10 @@ tsearch2 -
     Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
     Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
  
+unaccent -
+   Unaccent dictionary for text search
+   by Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+
  uuid-ossp -
     UUID generation functions
     by Peter Eisentraut <peter_e@gmx.net>
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile

new file mode 100644 (file)

index 0000000..91b04fc
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,24 @@
+# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
+
+MODULE_big = unaccent
+OBJS = unaccent.o
+
+DATA_built = unaccent.sql
+DATA = uninstall_unaccent.sql
+DATA_TSEARCH = unaccent.rules
+REGRESS = unaccent
+
+
+# Build either against an installed server (PGXS) or inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+# subdir must name this module's own directory below top_builddir.
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Redefine REGRESS_OPTS because the regression test needs a UTF8 database.
+REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out

new file mode 100644 (file)

index 0000000..8d197c5
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,58 @@
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SET client_encoding TO 'KOI8';
+SELECT unaccent('foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('£ÌËÁ');
+ unaccent 
+----------
+ ÅÌËÁ
+(1 row)
+
+SELECT unaccent('³öéë');
+ unaccent 
+----------
+ åöéë
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', '£ÌËÁ');
+ unaccent 
+----------
+ ÅÌËÁ
+(1 row)
+
+SELECT unaccent('unaccent', '³öéë');
+ unaccent 
+----------
+ åöéë
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('unaccent', '£ÌËÁ');
+ ts_lexize 
+-----------
+ {ÅÌËÁ}
+(1 row)
+
+SELECT ts_lexize('unaccent', '³öéë');
+ ts_lexize 
+-----------
+ {åöéë}
+(1 row)
+
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql

new file mode 100644 (file)

index 0000000..71ab5bb
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,19 @@
+SET client_min_messages = warning;
+\set ECHO none
+\i unaccent.sql
+\set ECHO all
+RESET client_min_messages;
+
+SET client_encoding TO 'KOI8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('£ÌËÁ');
+SELECT unaccent('³öéë');
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', '£ÌËÁ');
+SELECT unaccent('unaccent', '³öéë');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', '£ÌËÁ');
+SELECT ts_lexize('unaccent', '³öéë');
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c

new file mode 100644 (file)

index 0000000..7b5086b
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *    Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * Unaccent dictionary uses an uncompressed suffix tree to find a
+ * character to replace. Each node of the tree is an array of 256
+ * SuffixChar structs; the n-th element of the array corresponds to
+ * byte value n.
+ *
+ * A multibyte character is looked up one byte at a time, starting with
+ * its first byte: each byte selects an element of the current node, and
+ * nextChar leads to the node used for the following byte.
+ */
+typedef struct SuffixChar {
+   struct SuffixChar   *nextChar;   /* node for the next byte, or NULL */
+   char                *replaceTo;  /* replacement bytes (not NUL-terminated), or NULL */
+   int                 replacelen;  /* number of bytes in replaceTo */
+} SuffixChar;
+
+/*
+ * placeChar - insert the byte string str (lenstr bytes) into the tree
+ * rooted at node, one byte per tree level, and attach a copy of
+ * replaceTo (replacelen bytes) to the element reached by the last byte.
+ *
+ * A missing node (NULL) is allocated and zeroed on demand. Returns the
+ * node for this level, which is the freshly allocated one if node was NULL.
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+   SuffixChar  *slot;
+
+   /* create this level of the tree on first use */
+   if ( node == NULL )
+   {
+       node = palloc(sizeof(SuffixChar) * 256);
+       memset(node, 0, sizeof(SuffixChar) * 256);
+   }
+
+   slot = &node[*str];
+
+   /* more bytes to go: descend, creating the next level if necessary */
+   if ( lenstr != 1 )
+   {
+       slot->nextChar = placeChar( slot->nextChar, str+1, lenstr-1, replaceTo, replacelen);
+       return node;
+   }
+
+   /* last byte: the first replacement seen for this byte sequence wins */
+   if ( slot->replaceTo != NULL )
+   {
+       elog(WARNING, "duplicate TO argument, use first one");
+       return node;
+   }
+
+   slot->replacelen = replacelen;
+   slot->replaceTo = palloc( replacelen );
+   memcpy(slot->replaceTo, replaceTo, replacelen);
+
+   return node;
+}
+
+/*
+ * initSuffixTree  - create suffix tree from file. The UTF8-encoded file
+ * is converted into the current encoding while it is read.
+ *
+ * filename is resolved with get_tsearch_config_filename() using the
+ * extension "rules". Every line that holds two whitespace-separated
+ * tokens ("source target") adds one rule; other lines are ignored.
+ * Lines that cannot be converted to the current encoding are skipped.
+ *
+ * Raises an error if the file cannot be opened. Returns the root of the
+ * tree, or NULL if no rule was loaded.
+ */
+static SuffixChar*
+initSuffixTree(char *filename) 
+{
+   /*
+    * These two are modified inside PG_TRY and read after it, so they
+    * must be volatile to survive the longjmp taken on error.
+    */
+   SuffixChar * volatile rootSuffixTree = NULL;
+   volatile bool   skip;
+   MemoryContext ccxt = CurrentMemoryContext;
+   tsearch_readline_state  trst;
+
+   filename = get_tsearch_config_filename(filename, "rules");
+   if (!tsearch_readline_begin(&trst, filename))
+       ereport(ERROR,
+               (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                errmsg("could not open unaccent file \"%s\": %m",
+                       filename)));
+
+   do  
+   {
+       /*
+        * skip stays true until the read loop below reaches end of file
+        * without an error; an untranslatable line therefore restarts the
+        * read loop at the following line instead of ending the load.
+        */
+       skip = true;
+
+       PG_TRY();
+       {
+           char   *line;
+
+           /*
+            * pg_do_encoding_conversion() (called by tsearch_readline())
+            * will emit exception if it finds untranslatable characters in
+            * current locale. We just skip such lines.
+            */
+           while ((line = tsearch_readline(&trst)) != NULL)
+           {
+               char    src[4096];
+               char    trg[4096];
+
+               /* field widths keep sscanf within the 4096-byte buffers */
+               if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
+                   rootSuffixTree = placeChar(rootSuffixTree, 
+                                               (unsigned char*)src, strlen(src), 
+                                               trg, strlen(trg));
+
+               /* release the line whether or not it held a rule */
+               pfree(line);
+           }
+
+           /* end of file reached without error: we are done */
+           skip = false;
+       }
+       PG_CATCH();
+       {
+           ErrorData  *errdata;
+           MemoryContext ecxt;
+
+           ecxt = MemoryContextSwitchTo(ccxt);
+           errdata = CopyErrorData();
+           if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+           {
+               /* swallow the error and go on with the next line */
+               FlushErrorState();
+               FreeErrorData(errdata);
+           }
+           else
+           {
+               MemoryContextSwitchTo(ecxt);
+               PG_RE_THROW();
+           }
+       }
+       PG_END_TRY();
+   }
+   while(skip);
+
+   tsearch_readline_end(&trst);
+
+   return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - find multibyte character in tree
+ *
+ * Walks one tree level per byte of src. Returns the element selected by
+ * the last of the srclen bytes (its replaceTo may still be NULL), or
+ * NULL if the tree has no node for one of the bytes on the way.
+ */
+static SuffixChar * 
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+   SuffixChar *level = node;
+   int         pos;
+
+   for ( pos = 0; level != NULL; pos++ )
+   {
+       SuffixChar *cell = &level[ src[pos] ];
+
+       /* the final byte decides the result */
+       if ( srclen - pos == 1 )
+           return cell;
+
+       level = cell->nextChar;
+   }
+
+   return NULL;
+}
+
+/*
+ * unaccent_init - build the dictionary's private data from its options.
+ *
+ * Argument 0 is a List of DefElem options. Exactly one option named
+ * "Rules" (matched case-insensitively) is required; its string value is
+ * passed to initSuffixTree(). Any other option name, a repeated Rules
+ * option, or a missing Rules option raises ERRCODE_INVALID_PARAMETER_VALUE.
+ *
+ * Returns the root of the loaded tree as a pointer Datum.
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum       unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+   List       *dictoptions = (List *) PG_GETARG_POINTER(0);
+   SuffixChar *rootSuffixTree = NULL;  /* set once Rules has been loaded */
+   bool        fileloaded = false;
+   ListCell   *l;
+
+   foreach(l, dictoptions)
+   {
+       DefElem    *defel = (DefElem *) lfirst(l);
+
+       if (pg_strcasecmp("Rules", defel->defname) == 0)
+       {
+           if (fileloaded)
+           {
+               ereport(ERROR,
+                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                        errmsg("multiple Rules parameters")));
+           }
+           rootSuffixTree = initSuffixTree(defGetString(defel));
+           fileloaded = true;
+       }
+       else
+       {
+           ereport(ERROR,
+                   (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                    errmsg("unrecognized Unaccent parameter: \"%s\"",
+                           defel->defname)));
+       }
+   }
+
+   if (!fileloaded)
+   {
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                errmsg("missing Rules parameter")));
+   }
+
+   PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - replace characters of the input according to the tree.
+ *
+ * Argument 0 is the tree root, argument 1 points to the input bytes and
+ * argument 2 is the input length in bytes (the input need not be
+ * NUL-terminated; only that many bytes are scanned).
+ *
+ * Returns NULL if no character was replaced. Otherwise returns a
+ * zero-initialized array of two TSLexeme whose first element carries the
+ * rewritten, NUL-terminated string and the TSL_FILTER flag.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum       unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+   SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+   char       *srcchar = (char *) PG_GETARG_POINTER(1);
+   int32       len = PG_GETARG_INT32(2);
+   char       *srcstart;
+   int         charlen;
+   TSLexeme   *res = NULL;
+   SuffixChar *node;
+   int         trgsize = 0;    /* bytes allocated for res->lexeme */
+   int         trglen = 0;     /* bytes of res->lexeme in use */
+
+   srcstart = srcchar;
+   while( srcchar - srcstart < len )
+   {
+       char   *piece = NULL;   /* bytes to append for this character */
+       int     piecelen = 0;
+
+       charlen = pg_mblen(srcchar);
+
+       node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+       if ( node  && node->replaceTo )
+       {
+           if ( !res )
+           {
+               /* allocate res only if it's needed */
+               res = palloc0(sizeof(TSLexeme) * 2);
+               trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+               res->lexeme = palloc( trgsize );
+               res->flags = TSL_FILTER;
+
+               /* carry over the unchanged characters seen so far */
+               trglen = srcchar - srcstart;
+               if ( trglen > 0 )
+                   memcpy(res->lexeme, srcstart, trglen);
+           }
+           piece = node->replaceTo;
+           piecelen = node->replacelen;
+       }
+       else if ( res )
+       {
+           piece = srcchar;
+           piecelen = charlen;
+       }
+
+       if ( piece )
+       {
+           /*
+            * A replacement may be longer than the initial estimate allows
+            * (the rules file puts no limit on its length), so grow the
+            * buffer rather than write past its end.
+            */
+           if ( trglen + piecelen + 1 > trgsize )
+           {
+               trgsize = 2 * (trglen + piecelen + 1);
+               res->lexeme = repalloc( res->lexeme, trgsize );
+           }
+           memcpy( res->lexeme + trglen, piece, piecelen );
+           trglen += piecelen;
+       }
+
+       srcchar += charlen;
+   }
+
+   if ( res )
+       res->lexeme[trglen] = '\0';
+
+   PG_RETURN_POINTER(res);
+}
+
+/*
+ * Function-like wrapper for dictionary
+ *
+ * Called with one argument (text), the dictionary named "unaccent" is
+ * looked up; called with two, argument 0 is the dictionary OID and
+ * argument 1 the text. The dictionary's lexize method is applied to the
+ * whole string. If it reports no change, a copy of the input text is
+ * returned; otherwise the rewritten string is returned as text.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum       unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+   bool     implicitDict = (PG_NARGS() == 1);
+   int      strArg = implicitDict ? 0 : 1;
+   Oid      dictOid;
+   text    *str;
+   TSDictionaryCacheEntry  *dict;
+   TSLexeme *res;
+
+   if (implicitDict)
+       dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+   else
+       dictOid = PG_GETARG_OID(0);
+
+   str = PG_GETARG_TEXT_P(strArg);
+
+   dict = lookup_ts_dictionary_cache(dictOid);
+
+   res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+                                                    PointerGetDatum(dict->dictData),
+                                                    PointerGetDatum(VARDATA(str)),
+                                                    Int32GetDatum(VARSIZE(str) - VARHDRSZ),
+                                                    PointerGetDatum(NULL)));
+
+   PG_FREE_IF_COPY(str, strArg);
+
+   /* the dictionary produced a rewritten string: hand it back as text */
+   if ( res != NULL && res->lexeme != NULL )
+   {
+       text *txt = cstring_to_text(res->lexeme);
+
+       pfree(res->lexeme);
+       pfree(res);
+
+       PG_RETURN_TEXT_P(txt);
+   }
+
+   /* nothing was replaced: return the input unchanged */
+   if ( res != NULL )
+       pfree(res);
+
+   PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+}
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

new file mode 100644 (file)

index 0000000..cc2f7a6
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,187 @@
+À A
+Á A
+Â A
+Ã A
+Ä A
+Å A
+Æ A
+à a
+á a
+â a
+ã a
+ä a
+å a
+æ a
+Ā A
+ā a
+Ă A
+ă a
+Ą A
+ą a
+Ç C
+ç c
+Ć C
+ć c
+Ĉ C
+ĉ c
+Ċ C
+ċ c
+Č C
+č c
+Ď D
+ď d
+Đ D
+đ d
+È E
+É E
+Ê E
+Ë E
+è e
+é e
+ê e
+ë e
+Ē E
+ē e
+Ĕ E
+ĕ e
+Ė E
+ė e
+Ę E
+ę e
+Ě E
+ě e
+Ĝ G
+ĝ g
+Ğ G
+ğ g
+Ġ G
+ġ g
+Ģ G
+ģ g
+Ĥ H
+ĥ h
+Ħ H
+ħ h
+Ĩ I
+Ì I
+Í I
+Î I
+Ï I
+ì i
+í i
+î i
+ï i
+ĩ i
+Ī I
+ī i
+Ĭ I
+ĭ i
+Į I
+į i
+İ I
+ı i
+Ĳ I
+ĳ i
+Ĵ J
+ĵ j
+Ķ K
+ķ k
+ĸ k
+Ĺ L
+ĺ l
+Ļ L
+ļ l
+Ľ L
+ľ l
+Ŀ L
+ŀ l
+Ł L
+ł l
+Ñ N
+ñ n
+Ń N
+ń n
+Ņ N
+ņ n
+Ň N
+ň n
+ŉ n
+Ŋ N
+ŋ n
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+ò o
+ó o
+ô o
+õ o
+ö o
+Ō O
+ō o
+Ŏ O
+ŏ o
+Ő O
+ő o
+Œ E
+œ e
+Ø O
+ø o
+Ŕ R
+ŕ r
+Ŗ R
+ŗ r
+Ř R
+ř r
+ß S
+Ś S
+ś s
+Ŝ S
+ŝ s
+Ş S
+ş s
+Š S
+š s
+Ţ T
+ţ t
+Ť T
+ť t
+Ŧ T
+ŧ t
+Ù U
+Ú U
+Û U
+Ü U
+ù u
+ú u
+û u
+ü u
+Ũ U
+ũ u
+Ū U
+ū u
+Ŭ U
+ŭ u
+Ů U
+ů u
+Ű U
+ű u
+Ų U
+ų u
+Ŵ W
+ŵ w
+Ý Y
+ý y
+ÿ y
+Ŷ Y
+ŷ y
+Ÿ Y
+Ź Z
+ź z
+Ż Z
+ż z
+Ž Z
+ž z
+ё е
+Ё Е
diff --git a/contrib/unaccent/unaccent.sql.in b/contrib/unaccent/unaccent.sql.in

new file mode 100644 (file)

index 0000000..ba98139
--- /dev/null
+++ b/contrib/unaccent/unaccent.sql.in
@@ -0,0 +1,33 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- unaccent(dictionary, text): SQL-callable wrapper implemented by the C
+-- function unaccent_dict, taking the dictionary to use as first argument.
+-- NOTE(review): declared IMMUTABLE although the C code consults a text
+-- search dictionary whose rules are loaded from a file -- confirm that
+-- this volatility class is intended.
+CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
+   RETURNS text
+   AS 'MODULE_PATHNAME', 'unaccent_dict'
+   LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- unaccent(text): same C function; with a single argument it uses the
+-- dictionary named "unaccent".
+CREATE OR REPLACE FUNCTION unaccent(text)
+   RETURNS text
+   AS 'MODULE_PATHNAME', 'unaccent_dict'
+   LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- Support functions referenced by the text search template below.
+CREATE OR REPLACE FUNCTION unaccent_init(internal)
+   RETURNS internal
+   AS 'MODULE_PATHNAME', 'unaccent_init'
+   LANGUAGE C;
+
+CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+   RETURNS internal
+   AS 'MODULE_PATHNAME', 'unaccent_lexize'
+   LANGUAGE C;
+
+-- Template binding the init and lexize support functions.
+CREATE TEXT SEARCH TEMPLATE unaccent (
+    INIT = unaccent_init,
+   LEXIZE = unaccent_lexize
+);
+
+
+-- Default dictionary built on the template; RULES is the base name of
+-- the rules file (the ".rules" extension is added by the C code).
+CREATE TEXT SEARCH DICTIONARY unaccent (
+   TEMPLATE = unaccent,
+   RULES    = 'unaccent'
+);
+
diff --git a/contrib/unaccent/uninstall_unaccent.sql b/contrib/unaccent/uninstall_unaccent.sql

new file mode 100644 (file)

index 0000000..89e3627
--- /dev/null
+++ b/contrib/unaccent/uninstall_unaccent.sql
@@ -0,0 +1,9 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- Drop the SQL-callable wrappers first, then the dictionary before the
+-- template it is built on, and the template before the init/lexize
+-- support functions it references.
+DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
+DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
+DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
+DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
+DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
+DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
+
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml

index 0ef92b48968e85fb8f5e7863bf0b5560ca769cad..cffbc55249c8e111cbcfa93e6abd7ed5c88d33bf 100644 (file)
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->
  
  <appendix id="contrib">
   <title>Additional Supplied Modules</title>
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
   &tablefunc;
   &test-parser;
   &tsearch2;
+ &unaccent;
   &uuid-ossp;
   &vacuumlo;
   &xml2;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml

index 7e194f7bccb1ab7961197e55c3763ad1317df08d..bee66008b6695a4dd99d5f48980a56ae810f052a 100644 (file)
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->
  
  <!entity history    SYSTEM "history.sgml">
  <!entity info       SYSTEM "info.sgml">
@@ -126,6 +126,7 @@
  <!entity tablefunc       SYSTEM "tablefunc.sgml">
  <!entity test-parser     SYSTEM "test-parser.sgml">
  <!entity tsearch2        SYSTEM "tsearch2.sgml">
+<!entity unaccent      SYSTEM "unaccent.sgml">
  <!entity uuid-ossp       SYSTEM "uuid-ossp.sgml">
  <!entity vacuumlo        SYSTEM "vacuumlo.sgml">
  <!entity xml2            SYSTEM "xml2.sgml"> 
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml

new file mode 100644 (file)

index 0000000..b3c7bbe
--- /dev/null
+++ b/doc/src/sgml/unaccent.sgml
@@ -0,0 +1,150 @@
+<sect1 id="unaccent">
+ <title>unaccent</title>
+
+ <indexterm zone="unaccent">
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <para>
+  <filename>unaccent</> removes accents (diacritic signs) from a lexeme.
+  It is a filtering dictionary, which means that its output is
+  always passed to the next dictionary (if any), contrary to the standard
+  behaviour. Currently, it supports the most important accents from European
+  languages.
+ </para>
+
+ <para>
+  Limitation: The current implementation of the <filename>unaccent</>
+  dictionary cannot be used as a normalizing dictionary for the
+  <filename>thesaurus</filename> dictionary.
+ </para>
+ 
+ <sect2>
+  <title>Configuration</title>
+
+  <para>
+   An <literal>unaccent</> dictionary accepts the following options:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     <literal>RULES</> is the base name of the file containing the list of
+     translation rules.  This file must be stored in
+     <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
+     the <productname>PostgreSQL</> installation's shared-data directory).
+     Its name must end in <literal>.rules</> (which is not to be included in
+     the <literal>RULES</> parameter).
+    </para>
+   </listitem>
+  </itemizedlist>
+  <para>
+   The rules file has the following format:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     Each line represents a pair: a character with accent followed by the character without accent, for example
+    <programlisting>
+&Agrave;   A
+&Aacute;   A
+&Acirc;    A
+&Atilde;   A
+&Auml;     A
+&Aring;        A
+&AElig;    A
+    </programlisting>
+    </para>
+   </listitem>
+  </itemizedlist>
+
+  <para>
+   Look at <filename>unaccent.rules</>, which is installed in
+   <filename>$SHAREDIR/tsearch_data/</>, for an example.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Usage</title>
+
+  <para>
+   Running the installation script creates a text search template
+   <literal>unaccent</> and a dictionary <literal>unaccent</>
+   based on it, with default parameters.  You can alter the
+   parameters, for example
+
+<programlisting>
+=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
+</programlisting>
+
+   or create new dictionaries based on the template.
+  </para>
+
+  <para>
+   To test the dictionary, you can try
+
+<programlisting>
+=# select ts_lexize('unaccent','Hôtel');
+ ts_lexize 
+-----------
+ {Hotel}
+(1 row)
+</programlisting>
+  </para>
+  
+  <para>
+  Filtering dictionaries are useful for making the
+  <function>ts_headline</function> function work correctly.
+<programlisting>
+=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
+=# ALTER TEXT SEARCH CONFIGURATION fr
+   ALTER MAPPING FOR hword, hword_part, word
+   WITH unaccent, french_stem;
+=# select to_tsvector('fr','Hôtels de la Mer');
+    to_tsvector    
+-------------------
+ 'hotel':1 'mer':4
+(1 row)
+
+=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
+ ?column? 
+----------
+ t
+(1 row)
+=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
+      ts_headline       
+------------------------
+ &lt;b&gt;Hôtel&lt;/b&gt; de la Mer
+(1 row)
+
+</programlisting>
+  </para>
+ </sect2>
+
+ <sect2>
+ <title>Function</title>
+
+ <para>
+  The <function>unaccent</> function removes accents (diacritic signs) from
+  the argument string. Basically, it is a wrapper around the
+  <filename>unaccent</> dictionary.
+ </para>
+
+ <indexterm>
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <synopsis>
+   unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
+   </optional> <replaceable class="PARAMETER">string</replaceable>) 
+  returns <type>text</type>
+ </synopsis>  
+
+ <para>
+<programlisting>
+SELECT unaccent('unaccent','Hôtel');
+SELECT unaccent('Hôtel');
+</programlisting>
+ </para>
+ </sect2>
+
+</sect1>
author	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
contrib/Makefile		patch \| blob \| blame \| history
contrib/README		patch \| blob \| blame \| history
contrib/unaccent/Makefile	[new file with mode: 0644]	patch \| blob
contrib/unaccent/expected/unaccent.out	[new file with mode: 0644]	patch \| blob
contrib/unaccent/sql/unaccent.sql	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.c	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.rules	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.sql.in	[new file with mode: 0644]	patch \| blob
contrib/unaccent/uninstall_unaccent.sql	[new file with mode: 0644]	patch \| blob
doc/src/sgml/contrib.sgml		patch \| blob \| blame \| history
doc/src/sgml/filelist.sgml		patch \| blob \| blame \| history
doc/src/sgml/unaccent.sgml	[new file with mode: 0644]	patch \| blob