Rename and slightly redefine the default text search parser's "word" categories, as per discussion.

asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.
author: Tom Lane 2007-10-23 20:46:12 +0000
committer: Tom Lane 2007-10-23 20:46:12 +0000
commit: dbaec70c153239224c0288d865b96c2f939fbdf5 (patch)
tree: a2309acc315e5d4b9f9b0cd8b2ad60dc999ba93d /src/backend/snowball
parent: 344d0cae64dbf398559b855806fc7338ec0a2e64 (diff)
2 files changed, 15 insertions, 13 deletions
diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile
index e09d332e78..ba0c60db28 100644
--- a/src/backend/snowball/Makefile
+++ b/src/backend/snowball/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for src/backend/snowball
 #
-# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
+# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
 	stem_UTF_8_swedish.o \
 	stem_UTF_8_turkish.o
 
-# second column is name of latin dictionary, if different
-# Note order dependency: use of some other language as latin dictionary
+# first column is language name and also name of dictionary for not-all-ASCII
+# words, second is name of dictionary for all-ASCII words
+# Note order dependency: use of some other language as ASCII dictionary
 # must come after creation of that language
 LANGUAGES=  \
 	danish		danish 		\
@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
 	while [ "$$#" -gt 0 ] ; \
 	do \
 		lang=$$1; shift; \
-		nonlatdictname=$$lang; \
-		latdictname=$$1; shift; \
+		nonascdictname=$$lang; \
+		ascdictname=$$1; shift; \
 		if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
 			stop=", StopWords=$${lang}" ; \
 		else \
@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
 			sed -e "s#_LANGNAME_#$$lang#g" | \
 			sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
 			sed -e "s#_CFGNAME_#$$lang#g" | \
-			sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
-			sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
+			sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
+			sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
 			sed -e "s#_STOPWORDS_#$$stop#g" ; \
 	done >> $@
 else
diff --git a/src/backend/snowball/snowball.sql.in b/src/backend/snowball/snowball.sql.in
index 8b6328a083..7a32c85edb 100644
--- a/src/backend/snowball/snowball.sql.in
+++ b/src/backend/snowball/snowball.sql.in
@@ -1,4 +1,4 @@
--- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
+-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
 
 -- text search configuration for _LANGNAME_ language
 CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
 COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
 
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-	FOR email, url, host, sfloat, version, uri, file, float, int, uint
+	FOR email, url, host, sfloat, version, uri, file, float, int, uint,
+            numword, hword_numpart, numhword
 	WITH simple;
 
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-    FOR lhword, lpart_hword, lword
-	WITH _LATDICTNAME_;
+    FOR asciiword, hword_asciipart, asciihword
+	WITH _ASCDICTNAME_;
 
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-    FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
-	WITH _NONLATDICTNAME_;
+    FOR word, hword_part, hword
+	WITH _NONASCDICTNAME_;
author	Tom Lane	2007-10-23 20:46:12 +0000
committer	Tom Lane	2007-10-23 20:46:12 +0000
commit	dbaec70c153239224c0288d865b96c2f939fbdf5 (patch)
tree	a2309acc315e5d4b9f9b0cd8b2ad60dc999ba93d /src/backend/snowball
parent	344d0cae64dbf398559b855806fc7338ec0a2e64 (diff)