author: Teodor Sigaev <teodor@sigaev.ru>
Wed, 11 Mar 2009 16:03:40 +0000 (16:03 +0000)
committer: Teodor Sigaev <teodor@sigaev.ru>
Wed, 11 Mar 2009 16:03:40 +0000 (16:03 +0000)

Some languages have symbols with zero display width and/or vowels/signs which
are not alphabetic characters, although they are not word-breakers either.
So, treat them as part of a word.

Per off-list discussion with Dibyendra Hyoju <dibyendra@gmail.com> and
Bal Krishna Bal <balkrishna7bal@gmail.com> about the Nepali language and
the Devanagari alphabet.

src/backend/tsearch/wparser_def.c

index 1174b3fb774839661897475884335f38f77094bc..d7d72afddd8ac50d3309bd38ac835cc2bbcac395 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.22 2009/03/10 17:32:14 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.23 2009/03/11 16:03:40 teodor Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -659,6 +659,291 @@ p_isURLPath(TParser *prs)
    return res;
 }
 
+/*
+ * Returns true if the current character has zero display length or
+ * is a special sign used in several languages. Such characters
+ * are not word-breakers even though they fail isalpha().
+ * At the beginning of a word they are not considered part of it.
+ */
+static int
+p_isspecial(TParser *prs)
+{
+   /*
+    * pg_dsplen could return -1 which means error or control character
+    */
+   if ( pg_dsplen(prs->str + prs->state->posbyte) == 0 )
+       return 1;
+
+#ifdef USE_WIDE_UPPER_LOWER
+   /*
+    * Unicode characters in the 'Mark, Spacing Combining' category.
+    * These characters are not alphabetic, yet they do not break
+    * words either.
+    * Check them only under UTF-8, because other encodings containing
+    * them either aren't supported by postgres or don't exist.
+    */
+   if ( GetDatabaseEncoding() == PG_UTF8 && prs->usewide )
+   {
+       static pg_wchar strange_letter[] = {
+                       /*
+                        * use binary search, so elements
+                        * should be ordered
+                        */
+                       0x0903, /*  DEVANAGARI SIGN VISARGA */
+                       0x093E, /*  DEVANAGARI VOWEL SIGN AA */
+                       0x093F, /*  DEVANAGARI VOWEL SIGN I */
+                       0x0940, /*  DEVANAGARI VOWEL SIGN II */
+                       0x0949, /*  DEVANAGARI VOWEL SIGN CANDRA O */
+                       0x094A, /*  DEVANAGARI VOWEL SIGN SHORT O */
+                       0x094B, /*  DEVANAGARI VOWEL SIGN O */
+                       0x094C, /*  DEVANAGARI VOWEL SIGN AU */
+                       0x0982, /*  BENGALI SIGN ANUSVARA */
+                       0x0983, /*  BENGALI SIGN VISARGA */
+                       0x09BE, /*  BENGALI VOWEL SIGN AA */
+                       0x09BF, /*  BENGALI VOWEL SIGN I */
+                       0x09C0, /*  BENGALI VOWEL SIGN II */
+                       0x09C7, /*  BENGALI VOWEL SIGN E */
+                       0x09C8, /*  BENGALI VOWEL SIGN AI */
+                       0x09CB, /*  BENGALI VOWEL SIGN O */
+                       0x09CC, /*  BENGALI VOWEL SIGN AU */
+                       0x09D7, /*  BENGALI AU LENGTH MARK */
+                       0x0A03, /*  GURMUKHI SIGN VISARGA */
+                       0x0A3E, /*  GURMUKHI VOWEL SIGN AA */
+                       0x0A3F, /*  GURMUKHI VOWEL SIGN I */
+                       0x0A40, /*  GURMUKHI VOWEL SIGN II */
+                       0x0A83, /*  GUJARATI SIGN VISARGA */
+                       0x0ABE, /*  GUJARATI VOWEL SIGN AA */
+                       0x0ABF, /*  GUJARATI VOWEL SIGN I */
+                       0x0AC0, /*  GUJARATI VOWEL SIGN II */
+                       0x0AC9, /*  GUJARATI VOWEL SIGN CANDRA O */
+                       0x0ACB, /*  GUJARATI VOWEL SIGN O */
+                       0x0ACC, /*  GUJARATI VOWEL SIGN AU */
+                       0x0B02, /*  ORIYA SIGN ANUSVARA */
+                       0x0B03, /*  ORIYA SIGN VISARGA */
+                       0x0B3E, /*  ORIYA VOWEL SIGN AA */
+                       0x0B40, /*  ORIYA VOWEL SIGN II */
+                       0x0B47, /*  ORIYA VOWEL SIGN E */
+                       0x0B48, /*  ORIYA VOWEL SIGN AI */
+                       0x0B4B, /*  ORIYA VOWEL SIGN O */
+                       0x0B4C, /*  ORIYA VOWEL SIGN AU */
+                       0x0B57, /*  ORIYA AU LENGTH MARK */
+                       0x0BBE, /*  TAMIL VOWEL SIGN AA */
+                       0x0BBF, /*  TAMIL VOWEL SIGN I */
+                       0x0BC1, /*  TAMIL VOWEL SIGN U */
+                       0x0BC2, /*  TAMIL VOWEL SIGN UU */
+                       0x0BC6, /*  TAMIL VOWEL SIGN E */
+                       0x0BC7, /*  TAMIL VOWEL SIGN EE */
+                       0x0BC8, /*  TAMIL VOWEL SIGN AI */
+                       0x0BCA, /*  TAMIL VOWEL SIGN O */
+                       0x0BCB, /*  TAMIL VOWEL SIGN OO */
+                       0x0BCC, /*  TAMIL VOWEL SIGN AU */
+                       0x0BD7, /*  TAMIL AU LENGTH MARK */
+                       0x0C01, /*  TELUGU SIGN CANDRABINDU */
+                       0x0C02, /*  TELUGU SIGN ANUSVARA */
+                       0x0C03, /*  TELUGU SIGN VISARGA */
+                       0x0C41, /*  TELUGU VOWEL SIGN U */
+                       0x0C42, /*  TELUGU VOWEL SIGN UU */
+                       0x0C43, /*  TELUGU VOWEL SIGN VOCALIC R */
+                       0x0C44, /*  TELUGU VOWEL SIGN VOCALIC RR */
+                       0x0C82, /*  KANNADA SIGN ANUSVARA */
+                       0x0C83, /*  KANNADA SIGN VISARGA */
+                       0x0CBE, /*  KANNADA VOWEL SIGN AA */
+                       0x0CC0, /*  KANNADA VOWEL SIGN II */
+                       0x0CC1, /*  KANNADA VOWEL SIGN U */
+                       0x0CC2, /*  KANNADA VOWEL SIGN UU */
+                       0x0CC3, /*  KANNADA VOWEL SIGN VOCALIC R */
+                       0x0CC4, /*  KANNADA VOWEL SIGN VOCALIC RR */
+                       0x0CC7, /*  KANNADA VOWEL SIGN EE */
+                       0x0CC8, /*  KANNADA VOWEL SIGN AI */
+                       0x0CCA, /*  KANNADA VOWEL SIGN O */
+                       0x0CCB, /*  KANNADA VOWEL SIGN OO */
+                       0x0CD5, /*  KANNADA LENGTH MARK */
+                       0x0CD6, /*  KANNADA AI LENGTH MARK */
+                       0x0D02, /*  MALAYALAM SIGN ANUSVARA */
+                       0x0D03, /*  MALAYALAM SIGN VISARGA */
+                       0x0D3E, /*  MALAYALAM VOWEL SIGN AA */
+                       0x0D3F, /*  MALAYALAM VOWEL SIGN I */
+                       0x0D40, /*  MALAYALAM VOWEL SIGN II */
+                       0x0D46, /*  MALAYALAM VOWEL SIGN E */
+                       0x0D47, /*  MALAYALAM VOWEL SIGN EE */
+                       0x0D48, /*  MALAYALAM VOWEL SIGN AI */
+                       0x0D4A, /*  MALAYALAM VOWEL SIGN O */
+                       0x0D4B, /*  MALAYALAM VOWEL SIGN OO */
+                       0x0D4C, /*  MALAYALAM VOWEL SIGN AU */
+                       0x0D57, /*  MALAYALAM AU LENGTH MARK */
+                       0x0D82, /*  SINHALA SIGN ANUSVARAYA */
+                       0x0D83, /*  SINHALA SIGN VISARGAYA */
+                       0x0DCF, /*  SINHALA VOWEL SIGN AELA-PILLA */
+                       0x0DD0, /*  SINHALA VOWEL SIGN KETTI AEDA-PILLA */
+                       0x0DD1, /*  SINHALA VOWEL SIGN DIGA AEDA-PILLA */
+                       0x0DD8, /*  SINHALA VOWEL SIGN GAETTA-PILLA */
+                       0x0DD9, /*  SINHALA VOWEL SIGN KOMBUVA */
+                       0x0DDA, /*  SINHALA VOWEL SIGN DIGA KOMBUVA */
+                       0x0DDB, /*  SINHALA VOWEL SIGN KOMBU DEKA */
+                       0x0DDC, /*  SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
+                       0x0DDD, /*  SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA */
+                       0x0DDE, /*  SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
+                       0x0DDF, /*  SINHALA VOWEL SIGN GAYANUKITTA */
+                       0x0DF2, /*  SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
+                       0x0DF3, /*  SINHALA VOWEL SIGN DIGA GAYANUKITTA */
+                       0x0F3E, /*  TIBETAN SIGN YAR TSHES */
+                       0x0F3F, /*  TIBETAN SIGN MAR TSHES */
+                       0x0F7F, /*  TIBETAN SIGN RNAM BCAD */
+                       0x102B, /*  MYANMAR VOWEL SIGN TALL AA */
+                       0x102C, /*  MYANMAR VOWEL SIGN AA */
+                       0x1031, /*  MYANMAR VOWEL SIGN E */
+                       0x1038, /*  MYANMAR SIGN VISARGA */
+                       0x103B, /*  MYANMAR CONSONANT SIGN MEDIAL YA */
+                       0x103C, /*  MYANMAR CONSONANT SIGN MEDIAL RA */
+                       0x1056, /*  MYANMAR VOWEL SIGN VOCALIC R */
+                       0x1057, /*  MYANMAR VOWEL SIGN VOCALIC RR */
+                       0x1062, /*  MYANMAR VOWEL SIGN SGAW KAREN EU */
+                       0x1063, /*  MYANMAR TONE MARK SGAW KAREN HATHI */
+                       0x1064, /*  MYANMAR TONE MARK SGAW KAREN KE PHO */
+                       0x1067, /*  MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
+                       0x1068, /*  MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
+                       0x1069, /*  MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
+                       0x106A, /*  MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
+                       0x106B, /*  MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
+                       0x106C, /*  MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
+                       0x106D, /*  MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
+                       0x1083, /*  MYANMAR VOWEL SIGN SHAN AA */
+                       0x1084, /*  MYANMAR VOWEL SIGN SHAN E */
+                       0x1087, /*  MYANMAR SIGN SHAN TONE-2 */
+                       0x1088, /*  MYANMAR SIGN SHAN TONE-3 */
+                       0x1089, /*  MYANMAR SIGN SHAN TONE-5 */
+                       0x108A, /*  MYANMAR SIGN SHAN TONE-6 */
+                       0x108B, /*  MYANMAR SIGN SHAN COUNCIL TONE-2 */
+                       0x108C, /*  MYANMAR SIGN SHAN COUNCIL TONE-3 */
+                       0x108F, /*  MYANMAR SIGN RUMAI PALAUNG TONE-5 */
+                       0x17B6, /*  KHMER VOWEL SIGN AA */
+                       0x17BE, /*  KHMER VOWEL SIGN OE */
+                       0x17BF, /*  KHMER VOWEL SIGN YA */
+                       0x17C0, /*  KHMER VOWEL SIGN IE */
+                       0x17C1, /*  KHMER VOWEL SIGN E */
+                       0x17C2, /*  KHMER VOWEL SIGN AE */
+                       0x17C3, /*  KHMER VOWEL SIGN AI */
+                       0x17C4, /*  KHMER VOWEL SIGN OO */
+                       0x17C5, /*  KHMER VOWEL SIGN AU */
+                       0x17C7, /*  KHMER SIGN REAHMUK */
+                       0x17C8, /*  KHMER SIGN YUUKALEAPINTU */
+                       0x1923, /*  LIMBU VOWEL SIGN EE */
+                       0x1924, /*  LIMBU VOWEL SIGN AI */
+                       0x1925, /*  LIMBU VOWEL SIGN OO */
+                       0x1926, /*  LIMBU VOWEL SIGN AU */
+                       0x1929, /*  LIMBU SUBJOINED LETTER YA */
+                       0x192A, /*  LIMBU SUBJOINED LETTER RA */
+                       0x192B, /*  LIMBU SUBJOINED LETTER WA */
+                       0x1930, /*  LIMBU SMALL LETTER KA */
+                       0x1931, /*  LIMBU SMALL LETTER NGA */
+                       0x1933, /*  LIMBU SMALL LETTER TA */
+                       0x1934, /*  LIMBU SMALL LETTER NA */
+                       0x1935, /*  LIMBU SMALL LETTER PA */
+                       0x1936, /*  LIMBU SMALL LETTER MA */
+                       0x1937, /*  LIMBU SMALL LETTER RA */
+                       0x1938, /*  LIMBU SMALL LETTER LA */
+                       0x19B0, /*  NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
+                       0x19B1, /*  NEW TAI LUE VOWEL SIGN AA */
+                       0x19B2, /*  NEW TAI LUE VOWEL SIGN II */
+                       0x19B3, /*  NEW TAI LUE VOWEL SIGN U */
+                       0x19B4, /*  NEW TAI LUE VOWEL SIGN UU */
+                       0x19B5, /*  NEW TAI LUE VOWEL SIGN E */
+                       0x19B6, /*  NEW TAI LUE VOWEL SIGN AE */
+                       0x19B7, /*  NEW TAI LUE VOWEL SIGN O */
+                       0x19B8, /*  NEW TAI LUE VOWEL SIGN OA */
+                       0x19B9, /*  NEW TAI LUE VOWEL SIGN UE */
+                       0x19BA, /*  NEW TAI LUE VOWEL SIGN AY */
+                       0x19BB, /*  NEW TAI LUE VOWEL SIGN AAY */
+                       0x19BC, /*  NEW TAI LUE VOWEL SIGN UY */
+                       0x19BD, /*  NEW TAI LUE VOWEL SIGN OY */
+                       0x19BE, /*  NEW TAI LUE VOWEL SIGN OAY */
+                       0x19BF, /*  NEW TAI LUE VOWEL SIGN UEY */
+                       0x19C0, /*  NEW TAI LUE VOWEL SIGN IY */
+                       0x19C8, /*  NEW TAI LUE TONE MARK-1 */
+                       0x19C9, /*  NEW TAI LUE TONE MARK-2 */
+                       0x1A19, /*  BUGINESE VOWEL SIGN E */
+                       0x1A1A, /*  BUGINESE VOWEL SIGN O */
+                       0x1A1B, /*  BUGINESE VOWEL SIGN AE */
+                       0x1B04, /*  BALINESE SIGN BISAH */
+                       0x1B35, /*  BALINESE VOWEL SIGN TEDUNG */
+                       0x1B3B, /*  BALINESE VOWEL SIGN RA REPA TEDUNG */
+                       0x1B3D, /*  BALINESE VOWEL SIGN LA LENGA TEDUNG */
+                       0x1B3E, /*  BALINESE VOWEL SIGN TALING */
+                       0x1B3F, /*  BALINESE VOWEL SIGN TALING REPA */
+                       0x1B40, /*  BALINESE VOWEL SIGN TALING TEDUNG */
+                       0x1B41, /*  BALINESE VOWEL SIGN TALING REPA TEDUNG */
+                       0x1B43, /*  BALINESE VOWEL SIGN PEPET TEDUNG */
+                       0x1B44, /*  BALINESE ADEG ADEG */
+                       0x1B82, /*  SUNDANESE SIGN PANGWISAD */
+                       0x1BA1, /*  SUNDANESE CONSONANT SIGN PAMINGKAL */
+                       0x1BA6, /*  SUNDANESE VOWEL SIGN PANAELAENG */
+                       0x1BA7, /*  SUNDANESE VOWEL SIGN PANOLONG */
+                       0x1BAA, /*  SUNDANESE SIGN PAMAAEH */
+                       0x1C24, /*  LEPCHA SUBJOINED LETTER YA */
+                       0x1C25, /*  LEPCHA SUBJOINED LETTER RA */
+                       0x1C26, /*  LEPCHA VOWEL SIGN AA */
+                       0x1C27, /*  LEPCHA VOWEL SIGN I */
+                       0x1C28, /*  LEPCHA VOWEL SIGN O */
+                       0x1C29, /*  LEPCHA VOWEL SIGN OO */
+                       0x1C2A, /*  LEPCHA VOWEL SIGN U */
+                       0x1C2B, /*  LEPCHA VOWEL SIGN UU */
+                       0x1C34, /*  LEPCHA CONSONANT SIGN NYIN-DO */
+                       0x1C35, /*  LEPCHA CONSONANT SIGN KANG */
+                       0xA823, /*  SYLOTI NAGRI VOWEL SIGN A */
+                       0xA824, /*  SYLOTI NAGRI VOWEL SIGN I */
+                       0xA827, /*  SYLOTI NAGRI VOWEL SIGN OO */
+                       0xA880, /*  SAURASHTRA SIGN ANUSVARA */
+                       0xA881, /*  SAURASHTRA SIGN VISARGA */
+                       0xA8B4, /*  SAURASHTRA CONSONANT SIGN HAARU */
+                       0xA8B5, /*  SAURASHTRA VOWEL SIGN AA */
+                       0xA8B6, /*  SAURASHTRA VOWEL SIGN I */
+                       0xA8B7, /*  SAURASHTRA VOWEL SIGN II */
+                       0xA8B8, /*  SAURASHTRA VOWEL SIGN U */
+                       0xA8B9, /*  SAURASHTRA VOWEL SIGN UU */
+                       0xA8BA, /*  SAURASHTRA VOWEL SIGN VOCALIC R */
+                       0xA8BB, /*  SAURASHTRA VOWEL SIGN VOCALIC RR */
+                       0xA8BC, /*  SAURASHTRA VOWEL SIGN VOCALIC L */
+                       0xA8BD, /*  SAURASHTRA VOWEL SIGN VOCALIC LL */
+                       0xA8BE, /*  SAURASHTRA VOWEL SIGN E */
+                       0xA8BF, /*  SAURASHTRA VOWEL SIGN EE */
+                       0xA8C0, /*  SAURASHTRA VOWEL SIGN AI */
+                       0xA8C1, /*  SAURASHTRA VOWEL SIGN O */
+                       0xA8C2, /*  SAURASHTRA VOWEL SIGN OO */
+                       0xA8C3, /*  SAURASHTRA VOWEL SIGN AU */
+                       0xA952, /*  REJANG CONSONANT SIGN H */
+                       0xA953, /*  REJANG VIRAMA */
+                       0xAA2F, /*  CHAM VOWEL SIGN O */
+                       0xAA30, /*  CHAM VOWEL SIGN AI */
+                       0xAA33, /*  CHAM CONSONANT SIGN YA */
+                       0xAA34, /*  CHAM CONSONANT SIGN RA */
+                       0xAA4D  /*  CHAM CONSONANT SIGN FINAL H */
+                   };
+       pg_wchar    *StopLow = strange_letter,
+                   *StopHigh = strange_letter + lengthof(strange_letter),
+                   *StopMiddle;
+       pg_wchar    c;
+
+       if ( prs->pgwstr )
+           c = *(prs->pgwstr + prs->state->poschar);
+       else
+           c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+
+       while( StopLow < StopHigh )
+       {
+           StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+           if ( *StopMiddle == c )
+               return 1;
+           else if ( *StopMiddle < c )
+               StopLow = StopMiddle + 1;
+           else
+               StopHigh = StopMiddle;
+       }
+   }
+#endif
+
+   return 0;
+}
+
 /*
  * Table of state/action of parser
  */
@@ -683,6 +968,7 @@ static const TParserStateActionItem actionTPS_Base[] = {
 static const TParserStateActionItem actionTPS_InNumWord[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
    {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
@@ -703,12 +989,14 @@ static const TParserStateActionItem actionTPS_InAsciiWord[] = {
    {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
    {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
 };
 
 static const TParserStateActionItem actionTPS_InWord[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
    {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
    {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
@@ -723,6 +1011,7 @@ static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
    {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
 };
@@ -1196,6 +1485,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
    {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
    {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
    {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
@@ -1211,6 +1501,7 @@ static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
    {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
    {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
    {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
@@ -1226,6 +1517,7 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
    {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
    {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
    {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
 };
@@ -1234,6 +1526,7 @@ static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1249,12 +1542,14 @@ static const TParserStateActionItem actionTPS_InParseHyphen[] = {
 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
+   {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
 };
@@ -1263,6 +1558,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
    {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
 };
@@ -1270,6 +1566,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
    {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+   {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
 };
 
@@ -1277,6 +1574,7 @@ static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
+   {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };