-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.24 2007/10/23 20:46:12 tgl Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
</thead>
<tbody>
<row>
- <entry>lword</entry>
- <entry>Latin word (only ASCII letters)</entry>
+ <entry><literal>asciiword</></entry>
+ <entry>Word, all ASCII letters</entry>
<entry><literal>foo</literal></entry>
</row>
<row>
- <entry>nlword</entry>
- <entry>Non-latin word (only non-ASCII letters)</entry>
- <entry><literal></literal></entry>
+ <entry><literal>word</></entry>
+ <entry>Word, all letters</entry>
+ <entry><literal>føø</literal></entry>
</row>
<row>
- <entry>word</entry>
- <entry>Word (other cases)</entry>
+ <entry><literal>numword</></entry>
+ <entry>Word, letters and digits</entry>
<entry><literal>beta1</literal></entry>
</row>
<row>
- <entry>lhword</entry>
- <entry>Latin hyphenated word</entry>
+ <entry><literal>asciihword</></entry>
+ <entry>Hyphenated word, all ASCII</entry>
<entry><literal>foo-bar</literal></entry>
</row>
<row>
- <entry>nlhword</entry>
- <entry>Non-latin hyphenated word</entry>
- <entry><literal></literal></entry>
+ <entry><literal>hword</></entry>
+ <entry>Hyphenated word, all letters</entry>
+ <entry><literal>føø-bar</literal></entry>
</row>
<row>
- <entry>hword</entry>
- <entry>Hyphenated word</entry>
+ <entry><literal>numhword</></entry>
+ <entry>Hyphenated word, letters and digits</entry>
<entry><literal>foo-beta1</literal></entry>
</row>
<row>
- <entry>lpart_hword</entry>
- <entry>Latin part of hyphenated word</entry>
+ <entry><literal>hword_asciipart</></entry>
+ <entry>Hyphenated word part, all ASCII</entry>
<entry><literal>foo</literal> or <literal>bar</literal> in the context
- <literal>foo-bar</></entry>
+ <literal>foo-bar</literal></entry>
</row>
<row>
- <entry>nlpart_hword</entry>
- <entry>Non-latin part of hyphenated word</entry>
- <entry><literal></literal></entry>
+ <entry><literal>hword_part</></entry>
+ <entry>Hyphenated word part, all letters</entry>
+ <entry><literal>føø</literal> in the context
+ <literal>føø-bar</literal></entry>
</row>
<row>
- <entry>part_hword</entry>
- <entry>Part of hyphenated word</entry>
+ <entry><literal>hword_numpart</></entry>
+ <entry>Hyphenated word part, letters and digits</entry>
<entry><literal>beta1</literal> in the context
- <literal>foo-beta1</></entry>
+ <literal>foo-beta1</literal></entry>
</row>
<row>
- <entry>email</entry>
+ <entry><literal>email</></entry>
<entry>Email address</entry>
<entry><literal>foo@bar.com</literal></entry>
</row>
<row>
- <entry>protocol</entry>
+ <entry><literal>protocol</></entry>
<entry>Protocol head</entry>
<entry><literal>http://</literal></entry>
</row>
<row>
- <entry>url</entry>
+ <entry><literal>url</></entry>
<entry>URL</entry>
<entry><literal>foo.com/stuff/index.html</literal></entry>
</row>
<row>
- <entry>host</entry>
+ <entry><literal>host</></entry>
<entry>Host</entry>
<entry><literal>foo.com</literal></entry>
</row>
<row>
- <entry>uri</entry>
+ <entry><literal>uri</></entry>
<entry>URI</entry>
<entry><literal>/stuff/index.html</literal>, in the context of a URL</entry>
</row>
<row>
- <entry>file</entry>
+ <entry><literal>file</></entry>
<entry>File or path name</entry>
<entry><literal>/usr/local/foo.txt</literal>, if not within a URL</entry>
</row>
<row>
- <entry>sfloat</entry>
+ <entry><literal>sfloat</></entry>
<entry>Scientific notation</entry>
<entry><literal>-1.234e56</literal></entry>
</row>
<row>
- <entry>float</entry>
+ <entry><literal>float</></entry>
<entry>Decimal notation</entry>
<entry><literal>-1.234</literal></entry>
</row>
<row>
- <entry>int</entry>
+ <entry><literal>int</></entry>
<entry>Signed integer</entry>
<entry><literal>-1234</literal></entry>
</row>
<row>
- <entry>uint</entry>
+ <entry><literal>uint</></entry>
<entry>Unsigned integer</entry>
<entry><literal>1234</literal></entry>
</row>
<row>
- <entry>version</entry>
+ <entry><literal>version</></entry>
<entry>Version number</entry>
<entry><literal>8.3.0</literal></entry>
</row>
<row>
- <entry>tag</entry>
- <entry>HTML Tag</entry>
+ <entry><literal>tag</></entry>
+ <entry>HTML tag</entry>
<entry><literal>&lt;A HREF="dictionaries.html"&gt;</literal></entry>
</row>
<row>
- <entry>entity</entry>
- <entry>HTML Entity</entry>
+ <entry><literal>entity</></entry>
+ <entry>HTML entity</entry>
<entry><literal>&amp;amp;</literal></entry>
</row>
<row>
- <entry>blank</entry>
+ <entry><literal>blank</></entry>
<entry>Space symbols</entry>
<entry>(any whitespace or punctuation not otherwise recognized)</entry>
</row>
</tgroup>
</table>
+ <note>
+ <para>
+ The parser's notion of a <quote>letter</> is determined by the server's
+ locale setting, specifically <varname>lc_ctype</>. Words containing
+ only the basic ASCII letters are reported as a separate token type,
+ since it is sometimes useful to distinguish them. In most European
+ languages, token types <literal>word</> and <literal>asciiword</>
+ should always be treated alike.
+ </para>
+ </note>
+
<para>
It is possible for the parser to produce overlapping tokens from the same
piece of text. As an example, a hyphenated word will be reported both
<programlisting>
SELECT alias, description, token FROM ts_debug('foo-bar-beta1');
- alias | description | token
--------------+-------------------------------+---------------
- hword | Hyphenated word | foo-bar-beta1
- lpart_hword | Latin part of hyphenated word | foo
- blank | Space symbols | -
- lpart_hword | Latin part of hyphenated word | bar
- blank | Space symbols | -
- part_hword | Part of hyphenated word | beta1
+ alias | description | token
+-----------------+------------------------------------------+---------------
+ numhword | Hyphenated word, letters and digits | foo-bar-beta1
+ hword_asciipart | Hyphenated word part, all ASCII | foo
+ blank | Space symbols | -
+ hword_asciipart | Hyphenated word part, all ASCII | bar
+ blank | Space symbols | -
+ hword_numpart | Hyphenated word part, letters and digits | beta1
</programlisting>
This behavior is desirable since it allows searches to work for both
a <application>Snowball</> stemmer or <literal>simple</>, which
recognizes everything. For example, for an astronomy-specific search
(<literal>astro_en</literal> configuration) one could bind token type
- <type>lword</type> (Latin word) to a synonym dictionary of astronomical
+ <literal>asciiword</literal> (ASCII word) to a synonym dictionary of astronomical
terms, a general English dictionary and a <application>Snowball</> English
stemmer:
<programlisting>
ALTER TEXT SEARCH CONFIGURATION astro_en
- ADD MAPPING FOR lword WITH astrosyn, english_ispell, english_stem;
+ ADD MAPPING FOR asciiword WITH astrosyn, english_ispell, english_stem;
</programlisting>
</para>
<programlisting>
SELECT * FROM ts_debug('english', 'Paris');
- alias | description | token | dictionaries | dictionary | lexemes
--------+-------------+-------+----------------+--------------+---------
- lword | Latin word | Paris | {english_stem} | english_stem | {pari}
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------+-------+----------------+--------------+---------
+ asciiword | Word, all ASCII | Paris | {english_stem} | english_stem | {pari}
CREATE TEXT SEARCH DICTIONARY my_synonym (
TEMPLATE = synonym,
);
ALTER TEXT SEARCH CONFIGURATION english
- ALTER MAPPING FOR lword WITH my_synonym, english_stem;
+ ALTER MAPPING FOR asciiword WITH my_synonym, english_stem;
SELECT * FROM ts_debug('english', 'Paris');
- alias | description | token | dictionaries | dictionary | lexemes
--------+-------------+-------+---------------------------+------------+---------
- lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris}
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------+-------+---------------------------+------------+---------
+ asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
</programlisting>
</para>
uses these assignments to check if it should handle the next word or stop
accumulation. The thesaurus dictionary must be configured
carefully. For example, if the thesaurus dictionary is assigned to handle
- only the <literal>lword</literal> token, then a thesaurus dictionary
+ only the <literal>asciiword</literal> token, then a thesaurus dictionary
definition like <literal>one 7</> will not work since token type
<literal>uint</literal> is not assigned to the thesaurus dictionary.
</para>
<programlisting>
ALTER TEXT SEARCH CONFIGURATION russian
- ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple;
+ ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_simple;
</programlisting>
</para>
);
ALTER TEXT SEARCH CONFIGURATION russian
- ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_astro, english_stem;
+ ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_astro, english_stem;
</programlisting>
Now we can see how it works.
);
</programlisting>
- Now we can set up the mappings for Latin words for configuration
+ Now we can set up the mappings for words in configuration
<literal>pg</>:
<programlisting>
ALTER TEXT SEARCH CONFIGURATION pg
- ALTER MAPPING FOR lword, lhword, lpart_hword
+ ALTER MAPPING FOR asciiword, asciihword, hword_asciipart,
+ word, hword, hword_part
WITH pg_dict, english_ispell, english_stem;
</programlisting>
<programlisting>
SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
- alias | description | token | dictionaries | dictionary | lexemes
--------+---------------+-------+----------------+--------------+---------
- lword | Latin word | a | {english_stem} | english_stem | {}
- blank | Space symbols | | {} | |
- lword | Latin word | fat | {english_stem} | english_stem | {fat}
- blank | Space symbols | | {} | |
- lword | Latin word | cat | {english_stem} | english_stem | {cat}
- blank | Space symbols | | {} | |
- lword | Latin word | sat | {english_stem} | english_stem | {sat}
- blank | Space symbols | | {} | |
- lword | Latin word | on | {english_stem} | english_stem | {}
- blank | Space symbols | | {} | |
- lword | Latin word | a | {english_stem} | english_stem | {}
- blank | Space symbols | | {} | |
- lword | Latin word | mat | {english_stem} | english_stem | {mat}
- blank | Space symbols | | {} | |
- blank | Space symbols | - | {} | |
- lword | Latin word | it | {english_stem} | english_stem | {}
- blank | Space symbols | | {} | |
- lword | Latin word | ate | {english_stem} | english_stem | {ate}
- blank | Space symbols | | {} | |
- lword | Latin word | a | {english_stem} | english_stem | {}
- blank | Space symbols | | {} | |
- lword | Latin word | fat | {english_stem} | english_stem | {fat}
- blank | Space symbols | | {} | |
- lword | Latin word | rats | {english_stem} | english_stem | {rat}
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------+-------+----------------+--------------+---------
+ asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | cat | {english_stem} | english_stem | {cat}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | sat | {english_stem} | english_stem | {sat}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | on | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | mat | {english_stem} | english_stem | {mat}
+ blank | Space symbols | | {} | |
+ blank | Space symbols | - | {} | |
+ asciiword | Word, all ASCII | it | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | ate | {english_stem} | english_stem | {ate}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | rats | {english_stem} | english_stem | {rat}
</programlisting>
</para>
);
ALTER TEXT SEARCH CONFIGURATION public.english
- ALTER MAPPING FOR lword WITH english_ispell, english_stem;
+ ALTER MAPPING FOR asciiword WITH english_ispell, english_stem;
</programlisting>
<programlisting>
SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
- alias | description | token | dictionaries | dictionary | lexemes
--------+---------------+-------------+-------------------------------+----------------+-------------
- lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {}
- blank | Space symbols | | {} | |
- lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
- blank | Space symbols | | {} | |
- lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------+-------------+-------------------------------+----------------+-------------
+ asciiword | Word, all ASCII | The | {english_ispell,english_stem} | english_ispell | {}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
+ blank | Space symbols | | {} | |
+ asciiword | Word, all ASCII | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
</programlisting>
<para>
In this example, the word <literal>Brightest</> was recognized by the
- parser as a <literal>Latin word</literal> (alias <literal>lword</literal>).
+ parser as an <literal>ASCII word</literal> (alias <literal>asciiword</literal>).
For this token type the dictionary list is
<literal>english_ispell</> and
<literal>english_stem</literal>. The word was recognized by
<programlisting>
SELECT alias, token, dictionary, lexemes
FROM ts_debug('public.english','The Brightest supernovaes');
- alias | token | dictionary | lexemes
--------+-------------+----------------+-------------
- lword | The | english_ispell | {}
- blank | | |
- lword | Brightest | english_ispell | {bright}
- blank | | |
- lword | supernovaes | english_stem | {supernova}
+ alias | token | dictionary | lexemes
+-----------+-------------+----------------+-------------
+ asciiword | The | english_ispell | {}
+ blank | | |
+ asciiword | Brightest | english_ispell | {bright}
+ blank | | |
+ asciiword | supernovaes | english_stem | {supernova}
</programlisting>
</para>
<programlisting>
SELECT * FROM ts_token_type('default');
- tokid | alias | description
--------+--------------+-----------------------------------
- 1 | lword | Latin word
- 2 | nlword | Non-latin word
- 3 | word | Word
- 4 | email | Email
- 5 | url | URL
- 6 | host | Host
- 7 | sfloat | Scientific notation
- 8 | version | VERSION
- 9 | part_hword | Part of hyphenated word
- 10 | nlpart_hword | Non-latin part of hyphenated word
- 11 | lpart_hword | Latin part of hyphenated word
- 12 | blank | Space symbols
- 13 | tag | HTML Tag
- 14 | protocol | Protocol head
- 15 | hword | Hyphenated word
- 16 | lhword | Latin hyphenated word
- 17 | nlhword | Non-latin hyphenated word
- 18 | uri | URI
- 19 | file | File or path name
- 20 | float | Decimal notation
- 21 | int | Signed integer
- 22 | uint | Unsigned integer
- 23 | entity | HTML Entity
+ tokid | alias | description
+-------+-----------------+------------------------------------------
+ 1 | asciiword | Word, all ASCII
+ 2 | word | Word, all letters
+ 3 | numword | Word, letters and digits
+ 4 | email | Email address
+ 5 | url | URL
+ 6 | host | Host
+ 7 | sfloat | Scientific notation
+ 8 | version | Version number
+ 9 | hword_numpart | Hyphenated word part, letters and digits
+ 10 | hword_part | Hyphenated word part, all letters
+ 11 | hword_asciipart | Hyphenated word part, all ASCII
+ 12 | blank | Space symbols
+ 13 | tag | HTML tag
+ 14 | protocol | Protocol head
+ 15 | numhword | Hyphenated word, letters and digits
+ 16 | asciihword | Hyphenated word, all ASCII
+ 17 | hword | Hyphenated word, all letters
+ 18 | uri | URI
+ 19 | file | File or path name
+ 20 | float | Decimal notation
+ 21 | int | Signed integer
+ 22 | uint | Unsigned integer
+ 23 | entity | HTML entity
</programlisting>
</para>
=> \dF+ russian
Text search configuration "pg_catalog.russian"
Parser: "pg_catalog.default"
- Token | Dictionaries
---------------+--------------
- email | simple
- file | simple
- float | simple
- host | simple
- hword | russian_stem
- int | simple
- lhword | english_stem
- lpart_hword | english_stem
- lword | english_stem
- nlhword | russian_stem
- nlpart_hword | russian_stem
- nlword | russian_stem
- part_hword | russian_stem
- sfloat | simple
- uint | simple
- uri | simple
- url | simple
- version | simple
- word | russian_stem
+ Token | Dictionaries
+-----------------+--------------
+ asciihword | english_stem
+ asciiword | english_stem
+ email | simple
+ file | simple
+ float | simple
+ host | simple
+ hword | russian_stem
+ hword_asciipart | english_stem
+ hword_numpart | simple
+ hword_part | russian_stem
+ int | simple
+ numhword | simple
+ numword | simple
+ sfloat | simple
+ uint | simple
+ uri | simple
+ url | simple
+ version | simple
+ word | russian_stem
</programlisting>
</para>
</listitem>
Get headline | prsd_headline |
Get token types | prsd_lextype |
- Token types for parser "pg_catalog.default"
- Token name | Description
---------------+-----------------------------------
- blank | Space symbols
- email | Email
- entity | HTML Entity
- file | File or path name
- float | Decimal notation
- host | Host
- hword | Hyphenated word
- int | Signed integer
- lhword | Latin hyphenated word
- lpart_hword | Latin part of hyphenated word
- lword | Latin word
- nlhword | Non-latin hyphenated word
- nlpart_hword | Non-latin part of hyphenated word
- nlword | Non-latin word
- part_hword | Part of hyphenated word
- protocol | Protocol head
- sfloat | Scientific notation
- tag | HTML Tag
- uint | Unsigned integer
- uri | URI
- url | URL
- version | VERSION
- word | Word
+ Token types for parser "pg_catalog.default"
+ Token name | Description
+-----------------+------------------------------------------
+ asciihword | Hyphenated word, all ASCII
+ asciiword | Word, all ASCII
+ blank | Space symbols
+ email | Email address
+ entity | HTML entity
+ file | File or path name
+ float | Decimal notation
+ host | Host
+ hword | Hyphenated word, all letters
+ hword_asciipart | Hyphenated word part, all ASCII
+ hword_numpart | Hyphenated word part, letters and digits
+ hword_part | Hyphenated word part, all letters
+ int | Signed integer
+ numhword | Hyphenated word, letters and digits
+ numword | Word, letters and digits
+ protocol | Protocol head
+ sfloat | Scientific notation
+ tag | HTML tag
+ uint | Unsigned integer
+ uri | URI
+ url | URL
+ version | Version number
+ word | Word, all letters
(23 rows)
</programlisting>
</para>
/*-------------------------------------------------------------------------
*
* wparser_def.c
- * Standard word parser
+ * Default text search parser
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "utils/builtins.h"
-/* rememder !!!! */
-#define LASTNUM 23
+/* Output token categories */
-#define LATWORD 1
-#define CYRWORD 2
-#define UWORD 3
-#define EMAIL 4
-#define FURL 5
-#define HOST 6
-#define SCIENTIFIC 7
+#define ASCIIWORD 1
+#define WORD_T 2
+#define NUMWORD 3
+#define EMAIL 4
+#define URL_T 5
+#define HOST 6
+#define SCIENTIFIC 7
#define VERSIONNUMBER 8
-#define PARTHYPHENWORD 9
-#define CYRPARTHYPHENWORD 10
-#define LATPARTHYPHENWORD 11
-#define SPACE 12
-#define TAG 13
+#define NUMPARTHWORD 9
+#define PARTHWORD 10
+#define ASCIIPARTHWORD 11
+#define SPACE 12
+#define TAG_T 13
#define PROTOCOL 14
-#define HYPHENWORD 15
-#define LATHYPHENWORD 16
-#define CYRHYPHENWORD 17
-#define URI 18
-#define FILEPATH 19
-#define DECIMAL 20
-#define SIGNEDINT 21
-#define UNSIGNEDINT 22
-#define HTMLENTITY 23
-
-static const char *lex_descr[] = {
+#define NUMHWORD 15
+#define ASCIIHWORD 16
+#define HWORD 17
+#define URI 18
+#define FILEPATH 19
+#define DECIMAL 20
+#define SIGNEDINT 21
+#define UNSIGNEDINT 22
+#define HTMLENTITY 23
+
+#define LASTNUM 23
+
+static const char * const tok_alias[] = {
"",
- "Latin word",
- "Non-latin word",
- "Word",
- "Email",
- "URL",
- "Host",
- "Scientific notation",
- "VERSION",
- "Part of hyphenated word",
- "Non-latin part of hyphenated word",
- "Latin part of hyphenated word",
- "Space symbols",
- "HTML Tag",
- "Protocol head",
- "Hyphenated word",
- "Latin hyphenated word",
- "Non-latin hyphenated word",
- "URI",
- "File or path name",
- "Decimal notation",
- "Signed integer",
- "Unsigned integer",
- "HTML Entity"
-};
-
-static const char *tok_alias[] = {
- "",
- "lword",
- "nlword",
+ "asciiword",
"word",
+ "numword",
"email",
"url",
"host",
"sfloat",
"version",
- "part_hword",
- "nlpart_hword",
- "lpart_hword",
+ "hword_numpart",
+ "hword_part",
+ "hword_asciipart",
"blank",
"tag",
"protocol",
+ "numhword",
+ "asciihword",
"hword",
- "lhword",
- "nlhword",
"uri",
"file",
"float",
"entity"
};
+static const char * const lex_descr[] = {
+ "",
+ "Word, all ASCII",
+ "Word, all letters",
+ "Word, letters and digits",
+ "Email address",
+ "URL",
+ "Host",
+ "Scientific notation",
+ "Version number",
+ "Hyphenated word part, letters and digits",
+ "Hyphenated word part, all letters",
+ "Hyphenated word part, all ASCII",
+ "Space symbols",
+ "HTML tag",
+ "Protocol head",
+ "Hyphenated word, letters and digits",
+ "Hyphenated word, all ASCII",
+ "Hyphenated word, all letters",
+ "URI",
+ "File or path name",
+ "Decimal notation",
+ "Signed integer",
+ "Unsigned integer",
+ "HTML entity"
+};
+
+
+/* Parser states */
+
typedef enum
{
TPS_Base = 0,
- TPS_InUWord,
- TPS_InLatWord,
- TPS_InCyrWord,
+ TPS_InNumWord,
+ TPS_InAsciiWord,
+ TPS_InWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
- TPS_InHyphenLatWordFirst,
- TPS_InHyphenLatWord,
- TPS_InHyphenCyrWordFirst,
- TPS_InHyphenCyrWord,
- TPS_InHyphenUWordFirst,
- TPS_InHyphenUWord,
+ TPS_InHyphenAsciiWordFirst,
+ TPS_InHyphenAsciiWord,
+ TPS_InHyphenWordFirst,
+ TPS_InHyphenWord,
+ TPS_InHyphenNumWordFirst,
+ TPS_InHyphenNumWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
- TPS_InHyphenCyrWordPart,
- TPS_InHyphenLatWordPart,
- TPS_InHyphenUWordPart,
+ TPS_InHyphenWordPart,
+ TPS_InHyphenAsciiWordPart,
+ TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
/* forward declaration */
struct TParser;
-
typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
* except p_iseq */
typedef void (*TParserSpecial) (struct TParser *); /* special handler for
TParserSpecial special;
} TParserStateActionItem;
+/* Flag bits in TParserStateActionItem.flags */
+#define A_NEXT 0x0000
+#define A_BINGO 0x0001
+#define A_POP 0x0002
+#define A_PUSH 0x0004
+#define A_RERUN 0x0008
+#define A_CLEAR 0x0010
+#define A_MERGE 0x0020
+#define A_CLRALL 0x0040
+
typedef struct
{
TParserState state;
} TParser;
+
+/* forward decls here */
+static bool TParserGet(TParser * prs);
+
+
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
return prs;
}
-static bool TParserGet(TParser * prs);
-
static void
TParserClose(TParser * prs)
{
}
/*
- * defining support function, equvalent is* macroses, but
+ * Character-type support functions, equivalent to is* macros, but
* working with any possible encodings and locales. Note,
* that with multibyte encoding and C-locale isw* function may fail
- * or give wrong result. Note 2: multibyte encoding and C-local
+ * or give wrong result. Note 2: multibyte encoding and C-locale
* often are used for Asian languages
*/
}
static int
-p_islatin(TParser * prs)
+p_isasclet(TParser * prs)
{
- return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0;
+ return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
}
-static int
-p_isnonlatin(TParser * prs)
-{
- return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
-}
+/* deliberately suppress unused-function complaints for the above */
void _make_compiler_happy(void);
void
_make_compiler_happy(void)
* Table of state/action of parser
*/
-#define A_NEXT 0x0000
-#define A_BINGO 0x0001
-#define A_POP 0x0002
-#define A_PUSH 0x0004
-#define A_RERUN 0x0008
-#define A_CLEAR 0x0010
-#define A_MERGE 0x0020
-#define A_CLRALL 0x0040
-
static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
};
-static TParserStateActionItem actionTPS_InUWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
+static TParserStateActionItem actionTPS_InNumWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
+ {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
-static TParserStateActionItem actionTPS_InLatWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+static TParserStateActionItem actionTPS_InAsciiWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL}
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
-static TParserStateActionItem actionTPS_InCyrWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL}
+static TParserStateActionItem actionTPS_InWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
- {p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
- {p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
};
static TParserStateActionItem actionTPS_InTagEnd[] = {
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InCommentFirst[] = {
};
static TParserStateActionItem actionTPS_InCommentEnd[] = {
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
static TParserStateActionItem actionTPS_InHostDomain[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
static TParserStateActionItem actionTPS_InFileTwiddle[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
static TParserStateActionItem actionTPS_InPathFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL},
+ {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
};
-static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenUWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
+ {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
};
static TParserStateActionItem actionTPS_InHyphenValue[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
+ {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
};
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
};
/*
- * order should be the same as in typedef enum {} TParserState!!
+ * order must be the same as in typedef enum {} TParserState!!
*/
static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base},
- {TPS_InUWord, actionTPS_InUWord},
- {TPS_InLatWord, actionTPS_InLatWord},
- {TPS_InCyrWord, actionTPS_InCyrWord},
+ {TPS_InNumWord, actionTPS_InNumWord},
+ {TPS_InAsciiWord, actionTPS_InAsciiWord},
+ {TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt},
{TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd},
- {TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst},
- {TPS_InHyphenLatWord, actionTPS_InHyphenLatWord},
- {TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst},
- {TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord},
- {TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst},
- {TPS_InHyphenUWord, actionTPS_InHyphenUWord},
+ {TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
+ {TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
+ {TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
+ {TPS_InHyphenWord, actionTPS_InHyphenWord},
+ {TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
+ {TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
- {TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart},
- {TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart},
- {TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart},
+ {TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
+ {TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
+ {TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart},
{
TParserStateActionItem *item = NULL;
+ Assert(prs->state);
+
if (prs->state->posbyte >= prs->lenstr)
return false;
- Assert(prs->state);
prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL;
prs->state->state = item->tostate;
/* check for go away */
- if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0))
+ if ((item->flags & A_BINGO) ||
+ (prs->state->posbyte >= prs->lenstr &&
+ (item->flags & A_RERUN) == 0))
break;
- /* go to begining of loop if we should rerun or we just restore state */
+ /* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP))
continue;
PG_RETURN_VOID();
}
-#define LEAVETOKEN(x) ( (x)==12 )
-#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
-#define ENDPUNCTOKEN(x) ( (x)==12 )
-
+#define LEAVETOKEN(x) ( (x)==SPACE )
+#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define ENDPUNCTOKEN(x) ( (x)==SPACE )
-#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
-#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
-#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
-#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
-#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )
+#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
+#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
+#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
typedef struct
{