From dbaec70c153239224c0288d865b96c2f939fbdf5 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Tue, 23 Oct 2007 20:46:12 +0000
Subject: Rename and slightly redefine the default text search parser's "word"
 categories, as per discussion.  asciiword (formerly lword) is still
 ASCII-letters-only, and numword (formerly word) is still the most general
 mixed-alpha-and-digits case.  But word (formerly nlword) is now
 any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII
 as before.  This is no worse than before for parsing mixed Russian/English
 text, which seems to have been the design center for the original coding; and
 it should simplify matters for parsing most European languages.  In
 particular it will not be necessary for any language to accept strings
 containing digits as being regular "words".  The hyphenated-word categories
 are adjusted similarly.

---
 src/test/regress/expected/tsdicts.out | 10 +++----
 src/test/regress/expected/tsearch.out | 50 +++++++++++++++++------------------
 src/test/regress/sql/tsdicts.sql      | 11 ++++----
 3 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'src/test/regress')

diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out
index a1c13e70870..3520baceac7 100644
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
 (1 row)
 
 -- Create and simple test thesaurus dictionary
--- More test in configuration checks because of ts_lexize
--- can not give more tat one word as it may wish thesaurus.
+-- More tests in configuration checks because ts_lexize()
+-- cannot pass more than one word to thesaurus.
 CREATE TEXT SEARCH DICTIONARY thesaurus (
                         Template=thesaurus,
 						DictFile=thesaurus_sample, 
@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
 						COPY=english
 );
 ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
-	hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word 
+	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
                                             to_tsvector                                             
@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
 						COPY=english
 );
 ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
                     to_tsvector                    
@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
 						COPY=synonym_tst
 );
 ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, thesaurus, english_stem;
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
            to_tsvector            
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
index 6eb453194da..3d55715be38 100644
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity');
 (1 row)
 
 SELECT * FROM ts_token_type('default');
- tokid |    alias     |            description            
--------+--------------+-----------------------------------
-     1 | lword        | Latin word
-     2 | nlword       | Non-latin word
-     3 | word         | Word
-     4 | email        | Email
-     5 | url          | URL
-     6 | host         | Host
-     7 | sfloat       | Scientific notation
-     8 | version      | VERSION
-     9 | part_hword   | Part of hyphenated word
-    10 | nlpart_hword | Non-latin part of hyphenated word
-    11 | lpart_hword  | Latin part of hyphenated word
-    12 | blank        | Space symbols
-    13 | tag          | HTML Tag
-    14 | protocol     | Protocol head
-    15 | hword        | Hyphenated word
-    16 | lhword       | Latin hyphenated word
-    17 | nlhword      | Non-latin hyphenated word
-    18 | uri          | URI
-    19 | file         | File or path name
-    20 | float        | Decimal notation
-    21 | int          | Signed integer
-    22 | uint         | Unsigned integer
-    23 | entity       | HTML Entity
+ tokid |      alias      |               description                
+-------+-----------------+------------------------------------------
+     1 | asciiword       | Word, all ASCII
+     2 | word            | Word, all letters
+     3 | numword         | Word, letters and digits
+     4 | email           | Email address
+     5 | url             | URL
+     6 | host            | Host
+     7 | sfloat          | Scientific notation
+     8 | version         | Version number
+     9 | hword_numpart   | Hyphenated word part, letters and digits
+    10 | hword_part      | Hyphenated word part, all letters
+    11 | hword_asciipart | Hyphenated word part, all ASCII
+    12 | blank           | Space symbols
+    13 | tag             | HTML tag
+    14 | protocol        | Protocol head
+    15 | numhword        | Hyphenated word, letters and digits
+    16 | asciihword      | Hyphenated word, all ASCII
+    17 | hword           | Hyphenated word, all letters
+    18 | uri             | URI
+    19 | file            | File or path name
+    20 | float           | Decimal notation
+    21 | int             | Signed integer
+    22 | uint            | Unsigned integer
+    23 | entity          | HTML entity
 (23 rows)
 
 SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql
index 2e6cf791d87..f36e63a3110 100644
--- a/src/test/regress/sql/tsdicts.sql
+++ b/src/test/regress/sql/tsdicts.sql
@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
 SELECT ts_lexize('synonym', 'Gogle');
 
 -- Create and simple test thesaurus dictionary
--- More test in configuration checks because of ts_lexize
--- can not give more tat one word as it may wish thesaurus.
+-- More tests in configuration checks because ts_lexize()
+-- cannot pass more than one word to thesaurus.
 CREATE TEXT SEARCH DICTIONARY thesaurus (
                         Template=thesaurus,
 						DictFile=thesaurus_sample, 
@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
-	hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word 
+	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, thesaurus, english_stem;
 
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
 SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
 SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-
-- 
cgit v1.2.3