Unaccent dictionary.

author Teodor Sigaev <teodor@sigaev.ru>

Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
diff --git a/contrib/Makefile b/contrib/Makefile

index 247c4972f0da4690965f71d512118cd8f7617632..0afa149ac93a12999e08ff795c4396695161f48e 100644 (file)
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -39,6 +39,7 @@ SUBDIRS = \
                 tablefunc       \
                 test_parser     \
                 tsearch2        \
+               unaccent        \
                 vacuumlo
  
  ifeq ($(with_openssl),yes)
diff --git a/contrib/README b/contrib/README

index 1ae49adc704530d3f07e734116ef273efb08618a..a8396a5bfadf513ab5133da0d70a27a71ca5f961 100644 (file)
--- a/contrib/README
+++ b/contrib/README
@@ -169,6 +169,10 @@ tsearch2 -
         Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
         Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
  
+unaccent -
+       Unaccent dictionary for text search
+       Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+
  uuid-ossp -
         UUID generation functions
         by Peter Eisentraut <peter_e@gmx.net>
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile

new file mode 100644 (file)

index 0000000..0226912
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,24 @@
+# $PostgreSQL$
+
+MODULE_big = unaccent
+OBJS = unaccent.o
+
+DATA_built = unaccent.sql
+DATA = uninstall_unaccent.sql
+DATA_TSEARCH = unaccent.rules
+REGRESS = unaccent
+
+# Build either out-of-tree via PGXS or inside the PostgreSQL source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Redefine REGRESS_OPTS because the regression test needs a UTF8 database.
+REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out

new file mode 100644 (file)

index 0000000..8d197c5
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,58 @@
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SET client_encoding TO 'KOI8';
+SELECT unaccent('foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('£ÌËÁ');
+ unaccent 
+----------
+ ÅÌËÁ
+(1 row)
+
+SELECT unaccent('³öéë');
+ unaccent 
+----------
+ åöéë
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', '£ÌËÁ');
+ unaccent 
+----------
+ ÅÌËÁ
+(1 row)
+
+SELECT unaccent('unaccent', '³öéë');
+ unaccent 
+----------
+ åöéë
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('unaccent', '£ÌËÁ');
+ ts_lexize 
+-----------
+ {ÅÌËÁ}
+(1 row)
+
+SELECT ts_lexize('unaccent', '³öéë');
+ ts_lexize 
+-----------
+ {åöéë}
+(1 row)
+
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql

new file mode 100644 (file)

index 0000000..71ab5bb
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,19 @@
+SET client_min_messages = warning;
+\set ECHO none
+\i unaccent.sql
+\set ECHO all
+RESET client_min_messages;
+
+SET client_encoding TO 'KOI8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('£ÌËÁ');
+SELECT unaccent('³öéë');
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', '£ÌËÁ');
+SELECT unaccent('unaccent', '³öéë');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', '£ÌËÁ');
+SELECT ts_lexize('unaccent', '³öéë');
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c

new file mode 100644 (file)

index 0000000..924697f
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *    Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * The unaccent dictionary uses an uncompressed byte-indexed tree to find
+ * the string a character must be replaced with.  Each node of the tree is
+ * an array of 256 SuffixChar structs; the n-th element of the array
+ * corresponds to byte value n.
+ *
+ * Note that, despite the name, keys are inserted and looked up from their
+ * first byte towards their last (see placeChar and findReplaceTo).
+ */
+typedef struct SuffixChar {
+       /* node for the next byte of longer keys, or NULL if there is none */
+       struct SuffixChar       *nextChar;
+       /* replacement bytes for the key ending here, or NULL; no trailing \0 */
+       char                            *replaceTo;
+       /* number of bytes stored in replaceTo */
+       int                                     replacelen;
+} SuffixChar;
+
+/*
+ * placeChar - insert the key str (lenstr bytes) into the tree, one byte
+ * per level, and attach a copy of replaceTo (replacelen bytes) to its
+ * last byte.
+ *
+ * node is the level to insert into; if it is NULL a new zeroed level is
+ * allocated.  The (possibly new) level is returned so that the caller can
+ * store it.  If the key already has a replacement, a WARNING is emitted
+ * and the existing replacement is kept.
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+       SuffixChar      *slot;
+
+       /* create this level on demand: one zeroed slot per byte value */
+       if ( node == NULL )
+               node = palloc0(sizeof(SuffixChar) * 256);
+
+       slot = &node[*str];
+
+       if ( lenstr != 1 )
+       {
+               /* more bytes follow, so descend into (or create) the child level */
+               slot->nextChar = placeChar( slot->nextChar, str+1, lenstr-1,
+                                                                       replaceTo, replacelen);
+               return node;
+       }
+
+       /* last byte of the key: this slot carries the replacement */
+       if ( slot->replaceTo != NULL )
+       {
+               elog(WARNING, "duplicate TO argument, use first one");
+               return node;
+       }
+
+       slot->replacelen = replacelen;
+       slot->replaceTo = palloc( replacelen );
+       memcpy(slot->replaceTo, replaceTo, replacelen);
+
+       return node;
+}
+
+/*
+ * initSuffixTree - create the replacement tree from a rules file.
+ *
+ * filename is the base name of the rules file; it is resolved with
+ * get_tsearch_config_filename() using the "rules" extension.  The file is
+ * read through tsearch_readline(), which converts the UTF8-encoded file
+ * into the current encoding.  Each usable line supplies two
+ * whitespace-separated tokens: the string to replace and its replacement.
+ * Lines without two tokens, and lines too long for the token buffers, are
+ * skipped silently, as are lines that cannot be converted to the current
+ * encoding.
+ *
+ * Returns the root of the tree, or NULL if no rule was loaded.  Raises an
+ * ERROR if the file cannot be opened.
+ */
+static SuffixChar*
+initSuffixTree(char *filename)
+{
+       /*
+        * These two are assigned inside PG_TRY and read again after an error
+        * has been caught, so they must be volatile to survive the longjmp.
+        */
+       SuffixChar *volatile rootSuffixTree = NULL;
+       volatile bool   skip;
+       MemoryContext ccxt = CurrentMemoryContext;
+       tsearch_readline_state  trst;
+
+       filename = get_tsearch_config_filename(filename, "rules");
+       if (!tsearch_readline_begin(&trst, filename))
+               ereport(ERROR,
+                               (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                                errmsg("could not open unaccent file \"%s\": %m",
+                                               filename)));
+
+       do
+       {
+               char    src[4096];
+               char    trg[4096];
+
+               skip = true;
+
+               PG_TRY();
+               {
+                       char   *line;
+
+                       /*
+                        * pg_do_encoding_conversion() (called by tsearch_readline())
+                        * will emit exception if it finds untranslatable characters in
+                        * current locale.  We just skip such lines and carry on with
+                        * the next one.
+                        */
+                       while ((line = tsearch_readline(&trst)) != NULL)
+                       {
+                               /*
+                                * A token is never longer than its line, so rejecting
+                                * over-long lines keeps src and trg from overflowing;
+                                * the field widths enforce the same bound in sscanf.
+                                */
+                               if ( strlen(line) >= sizeof(src) ||
+                                        sscanf(line, "%4095s\t%4095s\n", src, trg) != 2 )
+                               {
+                                       pfree(line);
+                                       continue;
+                               }
+
+                               rootSuffixTree = placeChar(rootSuffixTree,
+                                                                                       (unsigned char*)src, strlen(src),
+                                                                                       trg, strlen(trg));
+                               pfree(line);
+                       }
+
+                       /*
+                        * End of file reached.  Clearing the flag only here (and not
+                        * per line) lets the outer loop resume after a skipped line
+                        * and lets it terminate for a file without any usable rule.
+                        */
+                       skip = false;
+               }
+               PG_CATCH();
+               {
+                       ErrorData  *errdata;
+                       MemoryContext ecxt;
+
+                       ecxt = MemoryContextSwitchTo(ccxt);
+                       errdata = CopyErrorData();
+                       if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+                       {
+                               /* swallow the error and release our copy of it */
+                               FlushErrorState();
+                               FreeErrorData(errdata);
+                       }
+                       else
+                       {
+                               MemoryContextSwitchTo(ecxt);
+                               PG_RE_THROW();
+                       }
+               }
+               PG_END_TRY();
+       }
+       while(skip);
+
+       tsearch_readline_end(&trst);
+
+       return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up the multibyte character src (srclen bytes) in
+ * the tree rooted at node.
+ *
+ * Returns the slot reached by the last byte (its replaceTo may still be
+ * NULL), or NULL if the tree ends before all bytes have been consumed.
+ */
+static SuffixChar *
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+       SuffixChar *level;
+
+       for ( level = node; level != NULL; src++, srclen-- )
+       {
+               SuffixChar *slot = &level[*src];
+
+               /* last byte of the character: this slot is the answer */
+               if ( srclen == 1 )
+                       return slot;
+
+               level = slot->nextChar;
+       }
+
+       return NULL;
+}
+
+/*
+ * unaccent_init - init method of the unaccent dictionary template.
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  The only
+ * option accepted is "Rules" (matched case-insensitively), which names
+ * the rules file and must be given exactly once; anything else raises an
+ * ERROR.  Returns the root of the tree built by initSuffixTree().
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum       unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+       List       *dictoptions = (List *) PG_GETARG_POINTER(0);
+       SuffixChar *rootSuffixTree = NULL;
+       bool        fileloaded = false;
+       ListCell   *l;
+
+       foreach(l, dictoptions)
+       {
+               DefElem    *defel = (DefElem *) lfirst(l);
+
+               if (pg_strcasecmp("Rules", defel->defname) == 0)
+               {
+                       if (fileloaded)
+                       {
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("multiple Rules parameters")));
+                       }
+
+                       rootSuffixTree = initSuffixTree(defGetString(defel));
+                       fileloaded = true;
+               }
+               else
+               {
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("unrecognized Unaccent parameter: \"%s\"",
+                                                       defel->defname)));
+               }
+       }
+
+       if (!fileloaded)
+       {
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("missing Rules parameter")));
+       }
+
+       PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - lexize method of the unaccent dictionary template.
+ *
+ * Argument 0 is the tree returned by unaccent_init, argument 1 points to
+ * the input bytes and argument 2 is their length.  The input is scanned
+ * one multibyte character at a time; every character that has a
+ * replacement in the tree is substituted.
+ *
+ * Returns NULL if no character was replaced.  Otherwise returns a zeroed
+ * array of two TSLexeme entries of which the first is filled in: its
+ * lexeme is the \0-terminated rewritten string and its flags are
+ * TSL_FILTER.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum       unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+       SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+       char       *srcchar = (char *) PG_GETARG_POINTER(1);
+       int32           len = PG_GETARG_INT32(2);
+       char       *srcstart = srcchar;
+       TSLexeme   *res = NULL;
+       char       *trg = NULL;         /* output buffer, allocated lazily */
+       Size            trglen = 0;             /* bytes used in trg */
+       Size            trgsize = 0;    /* bytes allocated for trg */
+
+       while( srcchar - srcstart < len )
+       {
+               int                     charlen = pg_mblen(srcchar);
+               SuffixChar *node;
+               const char *piece;
+               int                     piecelen;
+
+               node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+               if ( node && node->replaceTo )
+               {
+                       if ( !res )
+                       {
+                               /* allocate res only if it's needed */
+                               res = palloc0(sizeof(TSLexeme) * 2);
+                               res->flags = TSL_FILTER;
+
+                               trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+                               trg = palloc( trgsize );
+
+                               /* carry over the characters already passed unchanged */
+                               trglen = srcchar - srcstart;
+                               if ( trglen > 0 )
+                                       memcpy(trg, srcstart, trglen);
+                       }
+                       piece = node->replaceTo;
+                       piecelen = node->replacelen;
+               }
+               else if ( res )
+               {
+                       /* output already started: copy the character as is */
+                       piece = srcchar;
+                       piecelen = charlen;
+               }
+               else
+               {
+                       /* nothing replaced so far, nothing to copy yet */
+                       srcchar += charlen;
+                       continue;
+               }
+
+               /*
+                * A replacement taken from the rules file may be longer than the
+                * initial estimate allows for, so grow the buffer when needed
+                * instead of writing past its end.
+                */
+               if ( trglen + piecelen + 1 > trgsize )
+               {
+                       do
+                       {
+                               trgsize *= 2;
+                       }
+                       while ( trglen + piecelen + 1 > trgsize );
+
+                       trg = repalloc( trg, trgsize );
+               }
+
+               memcpy( trg + trglen, piece, piecelen );
+               trglen += piecelen;
+
+               srcchar += charlen;
+       }
+
+       if ( res )
+       {
+               trg[trglen] = '\0';
+               res->lexeme = trg;
+       }
+
+       PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - function-like wrapper for the dictionary.
+ *
+ * With one argument (text) the dictionary named "unaccent" is looked up;
+ * with two arguments the first is the dictionary OID and the second the
+ * text.  The text is passed to the dictionary's lexize function.  If that
+ * produces a lexeme it is returned as text, otherwise a copy of the input
+ * is returned.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum       unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+       bool            useDefault = (PG_NARGS() == 1);
+       int                     argno = useDefault ? 0 : 1;
+       Oid                     dictId;
+       text       *input;
+       TSDictionaryCacheEntry  *entry;
+       TSLexeme   *lexemes;
+       text       *result;
+
+       if (useDefault)
+               dictId = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+       else
+               dictId = PG_GETARG_OID(0);
+
+       input = PG_GETARG_TEXT_P(argno);
+       entry = lookup_ts_dictionary_cache(dictId);
+
+       lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
+                                                                                                        PointerGetDatum(entry->dictData),
+                                                                                                        PointerGetDatum(VARDATA(input)),
+                                                                                                        Int32GetDatum(VARSIZE(input) - VARHDRSZ),
+                                                                                                        PointerGetDatum(NULL)));
+
+       PG_FREE_IF_COPY(input, argno);
+
+       /* no usable lexeme: hand back the argument unchanged */
+       if ( lexemes == NULL || lexemes->lexeme == NULL )
+       {
+               if ( lexemes != NULL )
+                       pfree(lexemes);
+               PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(argno));
+       }
+
+       /* convert the lexeme to text, then release the lexize result */
+       result = cstring_to_text(lexemes->lexeme);
+
+       pfree(lexemes->lexeme);
+       pfree(lexemes);
+
+       PG_RETURN_TEXT_P(result);
+}
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

new file mode 100644 (file)

index 0000000..cc2f7a6
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,187 @@
+À     A
+Á     A
+Â     A
+Ã     A
+Ä     A
+Å     A
+Æ     A
+à     a
+á     a
+â     a
+ã     a
+ä     a
+å     a
+æ     a
+Ā     A
+ā     a
+Ă     A
+ă     a
+Ą     A
+ą     a
+Ç     C
+ç     c
+Ć     C
+ć     c
+Ĉ     C
+ĉ     c
+Ċ     C
+ċ     c
+Č     C
+č     c
+Ď     D
+ď     d
+Đ     D
+đ     d
+È     E
+É     E
+Ê     E
+Ë     E
+è     e
+é     e
+ê     e
+ë     e
+Ē     E
+ē     e
+Ĕ     E
+ĕ     e
+Ė     E
+ė     e
+Ę     E
+ę     e
+Ě     E
+ě     e
+Ĝ     G
+ĝ     g
+Ğ     G
+ğ     g
+Ġ     G
+ġ     g
+Ģ     G
+ģ     g
+Ĥ     H
+ĥ     h
+Ħ     H
+ħ     h
+Ĩ     I
+Ì     I
+Í     I
+Î     I
+Ï     I
+ì     i
+í     i
+î     i
+ï     i
+ĩ     i
+Ī     I
+ī     i
+Ĭ     I
+ĭ     i
+Į     I
+į     i
+İ     I
+ı     i
+Ĳ     I
+ĳ     i
+Ĵ     J
+ĵ     j
+Ķ     K
+ķ     k
+ĸ     k
+Ĺ     L
+ĺ     l
+Ļ     L
+ļ     l
+Ľ     L
+ľ     l
+Ŀ     L
+ŀ     l
+Ł     L
+ł     l
+Ñ     N
+ñ     n
+Ń     N
+ń     n
+Ņ     N
+ņ     n
+Ň     N
+ň     n
+ŉ     n
+Ŋ     N
+ŋ     n
+Ò     O
+Ó     O
+Ô     O
+Õ     O
+Ö     O
+ò     o
+ó     o
+ô     o
+õ     o
+ö     o
+Ō     O
+ō     o
+Ŏ     O
+ŏ     o
+Ő     O
+ő     o
+Œ     E
+œ     e
+Ø     O
+ø     o
+Ŕ     R
+ŕ     r
+Ŗ     R
+ŗ     r
+Ř     R
+ř     r
+ß     S
+Ś     S
+ś     s
+Ŝ     S
+ŝ     s
+Ş     S
+ş     s
+Š     S
+š     s
+Ţ     T
+ţ     t
+Ť     T
+ť     t
+Ŧ     T
+ŧ     t
+Ù     U
+Ú     U
+Û     U
+Ü     U
+ù     u
+ú     u
+û     u
+ü     u
+Ũ     U
+ũ     u
+Ū     U
+ū     u
+Ŭ     U
+ŭ     u
+Ů     U
+ů     u
+Ű     U
+ű     u
+Ų     U
+ų     u
+Ŵ     W
+ŵ     w
+Ý     Y
+ý     y
+ÿ     y
+Ŷ     Y
+ŷ     y
+Ÿ     Y
+Ź     Z
+ź     z
+Ż     Z
+ż     z
+Ž     Z
+ž     z
+ё     е
+Ё     Е
diff --git a/contrib/unaccent/unaccent.sql.in b/contrib/unaccent/unaccent.sql.in

new file mode 100644 (file)

index 0000000..4077225
--- /dev/null
+++ b/contrib/unaccent/unaccent.sql.in
@@ -0,0 +1,33 @@
+/* $PostgreSQL$ */
+
+-- unaccent(dictionary, text): remove accents from the text using the
+-- given dictionary.  Implemented by the C function unaccent_dict.
+-- NOTE(review): the result depends on the dictionary's rules file;
+-- confirm that IMMUTABLE is the intended volatility.
+CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
+       RETURNS text
+       AS 'MODULE_PATHNAME', 'unaccent_dict'
+       LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- unaccent(text): same C function; with a single argument it looks up
+-- the dictionary named "unaccent".
+CREATE OR REPLACE FUNCTION unaccent(text)
+       RETURNS text
+       AS 'MODULE_PATHNAME', 'unaccent_dict'
+       LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- Init method of the template: parses the dictionary options and loads
+-- the rules file.
+CREATE OR REPLACE FUNCTION unaccent_init(internal)
+       RETURNS internal
+       AS 'MODULE_PATHNAME', 'unaccent_init'
+       LANGUAGE C;
+
+-- Lexize method of the template: replaces characters of one input token.
+CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+       RETURNS internal
+       AS 'MODULE_PATHNAME', 'unaccent_lexize'
+       LANGUAGE C;
+
+-- Text search template built from the two methods above.
+CREATE TEXT SEARCH TEMPLATE unaccent (
+    INIT = unaccent_init,
+       LEXIZE = unaccent_lexize
+);
+
+
+-- Default dictionary based on the template; RULES = 'unaccent' selects
+-- the rules file with base name "unaccent".
+CREATE TEXT SEARCH DICTIONARY unaccent (
+       TEMPLATE = unaccent,
+       RULES    = 'unaccent'
+);
+
diff --git a/contrib/unaccent/uninstall_unaccent.sql b/contrib/unaccent/uninstall_unaccent.sql

new file mode 100644 (file)

index 0000000..40b5f6b
--- /dev/null
+++ b/contrib/unaccent/uninstall_unaccent.sql
@@ -0,0 +1,9 @@
+/* $PostgreSQL$ */
+
+-- Drop every object this module's install script creates: the two
+-- unaccent() functions, the dictionary, the template and its two methods.
+DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
+DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
+DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
+DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
+DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
+DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
+
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml

index efd110755e0d17a2f004608cb8961183f4f7ae79..0d18dbf800fdf4acb11dd49198c0d20039caa8b1 100644 (file)
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
   &tablefunc;
   &test-parser;
   &tsearch2;
+ &unaccent;
   &uuid-ossp;
   &vacuumlo;
   &xml2;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml

index 358c55df24ca909fdcceaa04d6696b72939a05b5..e7f501f4aa6e15bb57fee40cad60e16cf413635a 100644 (file)
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -126,6 +126,7 @@
  <!entity tablefunc       SYSTEM "tablefunc.sgml">
  <!entity test-parser     SYSTEM "test-parser.sgml">
  <!entity tsearch2        SYSTEM "tsearch2.sgml">
+<!entity unaccent      SYSTEM "unaccent.sgml">
  <!entity uuid-ossp       SYSTEM "uuid-ossp.sgml">
  <!entity vacuumlo        SYSTEM "vacuumlo.sgml">
  <!entity xml2            SYSTEM "xml2.sgml"> 
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml

new file mode 100644 (file)

index 0000000..b3c7bbe
--- /dev/null
+++ b/doc/src/sgml/unaccent.sgml
@@ -0,0 +1,150 @@
+<sect1 id="unaccent">
+ <title>unaccent</title>
+
+ <indexterm zone="unaccent">
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <para>
+  <filename>unaccent</> removes accents (diacritic signs) from a lexeme.
+  It is a filtering dictionary, which means that its output is
+  always passed to the next dictionary (if any), contrary to the standard
+  behaviour. Currently, it supports the most important accents from European
+  languages.
+ </para>
+
+ <para>
+  Limitation: The current implementation of the <filename>unaccent</>
+  dictionary cannot be used as a normalizing dictionary for the
+  <filename>thesaurus</filename> dictionary.
+ </para>
+ 
+ <sect2>
+  <title>Configuration</title>
+
+  <para>
+   An <literal>unaccent</> dictionary accepts the following options:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     <literal>RULES</> is the base name of the file containing the list of
+     translation rules.  This file must be stored in
+     <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
+     the <productname>PostgreSQL</> installation's shared-data directory).
+     Its name must end in <literal>.rules</> (which is not to be included in
+     the <literal>RULES</> parameter).
+    </para>
+   </listitem>
+  </itemizedlist>
+  <para>
+   The rules file has the following format:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     Each line represents a pair: character_with_accent  character_without_accent
+    <programlisting>
+&Agrave;       A
+&Aacute;       A
+&Acirc;        A
+&Atilde;       A
+&Auml;         A
+&Aring;                A
+&AElig;        A
+    </programlisting>
+    </para>
+   </listitem>
+  </itemizedlist>
+
+  <para>
+   Look at <filename>unaccent.rules</>, which is installed in
+   <filename>$SHAREDIR/tsearch_data/</>, for an example.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Usage</title>
+
+  <para>
+   Running the installation script creates a text search template
+   <literal>unaccent</> and a dictionary <literal>unaccent</>
+   based on it, with default parameters.  You can alter the
+   parameters, for example
+
+<programlisting>
+=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
+</programlisting>
+
+   or create new dictionaries based on the template.
+  </para>
+
+  <para>
+   To test the dictionary, you can try
+
+<programlisting>
+=# select ts_lexize('unaccent','Hôtel');
+ ts_lexize 
+-----------
+ {Hotel}
+(1 row)
+</programlisting>
+  </para>
+  
+  <para>
+  Filtering dictionaries are useful for making the
+  <function>ts_headline</function> function work correctly.
+<programlisting>
+=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
+=# ALTER TEXT SEARCH CONFIGURATION fr
+       ALTER MAPPING FOR hword, hword_part, word
+       WITH unaccent, french_stem;
+=# select to_tsvector('fr','Hôtels de la Mer');
+    to_tsvector    
+-------------------
+ 'hotel':1 'mer':4
+(1 row)
+
+=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
+ ?column? 
+----------
+ t
+(1 row)
+=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
+      ts_headline       
+------------------------
+  &lt;b&gt;Hôtel&lt;/b&gt; de la Mer
+(1 row)
+
+</programlisting>
+  </para>
+ </sect2>
+
+ <sect2>
+ <title>Function</title>
+
+ <para>
+  The <function>unaccent</> function removes accents (diacritic signs) from
+  its argument string. Basically, it's a wrapper around the
+  <filename>unaccent</> dictionary.
+ </para>
+
+ <indexterm>
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <synopsis>
+   unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
+   </optional> <replaceable class="PARAMETER">string</replaceable>) 
+  returns <type>text</type>
+ </synopsis>  
+
+ <para>
+<programlisting>
+SELECT unaccent('unaccent','Hôtel');
+SELECT unaccent('Hôtel');
+</programlisting>
+ </para>
+ </sect2>
+
+</sect1>
author	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 18 Aug 2009 10:34:39 +0000 (10:34 +0000)
contrib/Makefile		patch \| blob \| blame \| history
contrib/README		patch \| blob \| blame \| history
contrib/unaccent/Makefile	[new file with mode: 0644]	patch \| blob
contrib/unaccent/expected/unaccent.out	[new file with mode: 0644]	patch \| blob
contrib/unaccent/sql/unaccent.sql	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.c	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.rules	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.sql.in	[new file with mode: 0644]	patch \| blob
contrib/unaccent/uninstall_unaccent.sql	[new file with mode: 0644]	patch \| blob
doc/src/sgml/contrib.sgml		patch \| blob \| blame \| history
doc/src/sgml/filelist.sgml		patch \| blob \| blame \| history
doc/src/sgml/unaccent.sgml	[new file with mode: 0644]	patch \| blob