Text parser rewritten:
authorTeodor Sigaev <teodor@sigaev.ru>
Mon, 21 Nov 2005 12:27:57 +0000 (12:27 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Mon, 21 Nov 2005 12:27:57 +0000 (12:27 +0000)
        - supports multibyte encodings
        - more strict rules for lexemes
        - flex isn't used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operations on tsquery for combining
          a tsquery from its parts:  'foo & bar' || 'asd' => 'foo & bar | asd'

15 files changed:
contrib/tsearch2/Makefile
contrib/tsearch2/expected/tsearch2.out
contrib/tsearch2/query.c
contrib/tsearch2/query_support.c
contrib/tsearch2/sql/tsearch2.sql
contrib/tsearch2/ts_locale.c [new file with mode: 0644]
contrib/tsearch2/ts_locale.h [new file with mode: 0644]
contrib/tsearch2/tsearch.sql.in
contrib/tsearch2/wordparser/Makefile
contrib/tsearch2/wordparser/deflex.c
contrib/tsearch2/wordparser/deflex.h
contrib/tsearch2/wordparser/parser.c [new file with mode: 0644]
contrib/tsearch2/wordparser/parser.h
contrib/tsearch2/wordparser/parser.l [deleted file]
contrib/tsearch2/wparser_def.c

index 4901b611ee1e0648f914d8190213b5a533a89492..2ef904ddb4e01629b80c70b8f0a67caf7bce5dcd 100644 (file)
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
 
 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
@@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
        wparser.o wparser_def.o \
        ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
        tsvector_op.o rank.o ts_stat.o \
-       query_util.o query_support.o query_rewrite.o query_gist.o
+       query_util.o query_support.o query_rewrite.o query_gist.o \
+       ts_locale.o
 
 SUBDIRS     := snowball ispell wordparser
 SUBDIROBJS  := $(SUBDIRS:%=%/SUBSYS.o)
index 296c0ac676f874dd6e2ed3ed7996df6a8eb8ff0f..a98c2216a8da55d0074aa1c35e3a4190532fa376 100644 (file)
@@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE:  argument type tsvector is only a shell
 psql:tsearch2.sql:396: NOTICE:  type "tsquery" is not yet defined
 DETAIL:  Creating a shell type definition.
 psql:tsearch2.sql:401: NOTICE:  argument type tsquery is only a shell
-psql:tsearch2.sql:544: NOTICE:  type "gtsvector" is not yet defined
+psql:tsearch2.sql:559: NOTICE:  type "gtsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:549: NOTICE:  argument type gtsvector is only a shell
-psql:tsearch2.sql:998: NOTICE:  type "gtsq" is not yet defined
+psql:tsearch2.sql:564: NOTICE:  argument type gtsvector is only a shell
+psql:tsearch2.sql:1054: NOTICE:  type "gtsq" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:1003: NOTICE:  argument type gtsq is only a shell
+psql:tsearch2.sql:1059: NOTICE:  argument type gtsq is only a shell
 --tsvector
 SELECT '1'::tsvector;
  tsvector 
@@ -653,7 +653,7 @@ select * from token_type('default');
     11 | lpart_hword  | Latin part of hyphenated word
     12 | blank        | Space symbols
     13 | tag          | HTML Tag
-    14 | http         | HTTP head
+    14 | protocol     | Protocol head
     15 | hword        | Hyphenated word
     16 | lhword       | Latin hyphenated word
     17 | nlhword      | Non-latin hyphenated word
@@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
 -------+--------------------------------------
     22 | 345
     12 |  
-     4 | qwe@efd.r
-    12 |  
-    12 | '
-    12 |  
+     1 | qwe
+    12 | @
+    19 | efd.r
+    12 |  
     14 | http://
      6 | www.com
-    12 | /
-    12 |  
+    12 | / 
     14 | http://
      5 | aew.werc.ewr/?ad=qwe&dw
      6 | aew.werc.ewr
@@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      6 | 4aew.werc.ewr
     12 |  
     14 | http://
-     5 | 5aew.werc.ewr:8100/?
-     6 | 5aew.werc.ewr
-    18 | :8100/?
-    12 |   
+     6 | 5aew.werc.ewr:8100
+    12 | /?  
      1 | ad
     12 | =
      1 | qwe
@@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      1 | dw
     12 |  
      5 | 6aew.werc.ewr:8100/?ad=qwe&dw
-     6 | 6aew.werc.ewr
-    18 | :8100/?ad=qwe&dw
+     6 | 6aew.werc.ewr:8100
+    18 | /?ad=qwe&dw
     12 |  
      5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
-     6 | 7aew.werc.ewr
-    18 | :8100/?ad=qwe&dw=%20%32
+     6 | 7aew.werc.ewr:8100
+    18 | /?ad=qwe&dw=%20%32
     12 |  
      7 | +4.0e-10
     12 |  
@@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      1 | jf
     12 |  
      1 | sdjk
-    13 | <we hjwer <werrwe>
+    12 | <
+     1 | we
     12 |  
-     3 | ewr1
-    12 | >
+     1 | hjwer
+    12 |  
+    13 | <werrwe>
     12 |  
+     3 | ewr1
+    12 | > 
      3 | ewri2
     12 |  
     13 | <a href="qwe<qwe>">
@@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
     12 |  
     19 | /wqe-324/ewr
     12 |  
-     6 | gist.h
-    12 |  
-     6 | gist.h.c
+    19 | gist.h
     12 |  
-     6 | gist.c
-    12 | .
+    19 | gist.h.c
     12 |  
+    19 | gist.c
+    12 | . 
      1 | readline
     12 |  
     20 | 4.2
     12 |  
     20 | 4.2
-    12 | .
-    12 |  
+    12 | . 
     20 | 4.2
-    12 | ,
-    12 |  
-    15 | readline-4
+    12 | , 
+    15 | readline-4.2
     11 | readline
     12 | -
     20 | 4.2
     12 |  
-    15 | readline-4
+    15 | readline-4.2
     11 | readline
     12 | -
     20 | 4.2
-    12 | .
-    12 |  
+    12 | . 
     22 | 234
     12 |  
 
-    13 | <i <b>
+    12 | <
+     1 | i
+    12 |  
+    13 | <b>
     12 |  
      1 | wow
     12 |   
-    12 | <
-    12 |  
+    12 | < 
      1 | jqw
     12 |  
-    12 | <
-    12 | >
-    12 |  
+    12 | <> 
      1 | qwerty
-(138 rows)
+(135 rows)
 
 SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                                                                                                                               to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                                
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24
+                                                                                                                                                                                                                                                                                                                                                                                                                             to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                              
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
 (1 row)
 
 SELECT length(to_tsvector('default', '345 qw'));
@@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae
 <i <b> wow  < jqw <> qwerty'));
  length 
 --------
-     53
+     51
 (1 row)
 
 select to_tsquery('default', 'qwe & sKies '); 
@@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj');
  '1' & 'fghj'
 (1 row)
 
+select plainto_tsquery('default', 'the and z 1))& fghj');
+  plainto_tsquery   
+--------------------
+ 'z' & '1' & 'fghj'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
+       ?column?        
+-----------------------
+ 'foo' & 'bar' & 'asd'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
+           ?column?           
+------------------------------
+ 'foo' & 'bar' | 'asd' & 'fg'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
+             ?column?              
+-----------------------------------
+ 'foo' & 'bar' | !( 'asd' & 'fg' )
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') && 'asd | fg';
+             ?column?             
+----------------------------------
+ 'foo' & 'bar' & ( 'asd' | 'fg' )
+(1 row)
+
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
  ?column? 
 ----------
index e6f1ae3a8984fb36086edd3bb6da04fce160cfbe..e312cf6af7166f00ffde947583200552dd5cc35b 100644 (file)
@@ -51,10 +51,20 @@ Datum       to_tsquery_name(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(to_tsquery_current);
 Datum      to_tsquery_current(PG_FUNCTION_ARGS);
 
+PG_FUNCTION_INFO_V1(plainto_tsquery);
+Datum      plainto_tsquery(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(plainto_tsquery_name);
+Datum      plainto_tsquery_name(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(plainto_tsquery_current);
+Datum      plainto_tsquery_current(PG_FUNCTION_ARGS);
+
 /* parser's states */
 #define WAITOPERAND 1
 #define WAITOPERATOR   2
 #define WAITFIRSTOPERAND 3
+#define WAITSINGLEOPERAND 4
 
 /*
  * node of query tree, also used
@@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                else if (*(state->buf) != ' ')
                    return ERR;
                break;
+           case WAITSINGLEOPERAND:
+               if ( *(state->buf) == '\0' ) 
+                   return END;
+               *strval = state->buf;
+               *lenval = strlen( state->buf );
+               state->buf += strlen( state->buf );
+               state->count++;
+               return VAL; 
            default:
                return ERR;
                break;
@@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
  * input
  */
 static QUERYTYPE *
-           queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id)
+queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
 {
    QPRS_STATE  state;
    int4        i;
@@ -599,7 +617,7 @@ static QUERYTYPE *
 
    /* init state */
    state.buf = buf;
-   state.state = WAITFIRSTOPERAND;
+   state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
    state.count = 0;
    state.num = 0;
    state.str = NULL;
@@ -679,7 +697,7 @@ Datum
 tsquery_in(PG_FUNCTION_ARGS)
 {
    SET_FUNCOID();
-   PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0));
+   PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
 }
 
 /*
@@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
    str = text2char(in);
    PG_FREE_IF_COPY(in, 1);
 
-   query = queryin(str, pushval_morph, PG_GETARG_INT32(0));
+   query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
    
    if ( query->size == 0 )
        PG_RETURN_POINTER(query);
@@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
                                        Int32GetDatum(get_currcfg()),
                                        PG_GETARG_DATUM(0)));
 }
+
+/*
+ * plainto_tsquery(cfg_id, text) -- build a tsquery from plain text.
+ *
+ * The text is handed to queryin() with isplain = true, so the whole string
+ * is taken as a single operand and pushed through pushval_morph.  The
+ * resulting tree is then cleaned with clean_fakeval_v2(); if nothing
+ * survives, an empty query is returned.
+ */
+Datum
+plainto_tsquery(PG_FUNCTION_ARGS)
+{
+   text       *txt = PG_GETARG_TEXT_P(1);
+   char       *plain;
+   QUERYTYPE  *result;
+   ITEM       *cleaned;
+   int4        nitems;
+
+   SET_FUNCOID();
+
+   plain = text2char(txt);
+   PG_FREE_IF_COPY(txt, 1);
+
+   result = queryin(plain, pushval_morph, PG_GETARG_INT32(0), true);
+
+   if ( result->size != 0 )
+   {
+       cleaned = clean_fakeval_v2(GETQUERY(result), &nitems);
+       if (cleaned)
+       {
+           /* overwrite the item array in place with the cleaned items */
+           memcpy((void *) GETQUERY(result), (void *) cleaned, nitems * sizeof(ITEM));
+           pfree(cleaned);
+       }
+       else
+       {
+           /* cleaning removed everything: turn it into an empty query */
+           result->len = HDRSIZEQT;
+           result->size = 0;
+       }
+   }
+
+   PG_RETURN_POINTER(result);
+}
+
+/*
+ * plainto_tsquery(cfg_name, text) -- resolve the configuration name with
+ * name2id_cfg() and delegate to plainto_tsquery().
+ */
+Datum
+plainto_tsquery_name(PG_FUNCTION_ARGS)
+{
+   text       *cfgname = PG_GETARG_TEXT_P(0);
+   Datum       cfgid;
+   Datum       query;
+
+   SET_FUNCOID();
+
+   cfgid = Int32GetDatum(name2id_cfg(cfgname));
+   query = DirectFunctionCall2(plainto_tsquery, cfgid, PG_GETARG_DATUM(1));
+
+   PG_FREE_IF_COPY(cfgname, 0);
+   PG_RETURN_DATUM(query);
+}
+
+/*
+ * plainto_tsquery(text) -- same as plainto_tsquery(cfg_id, text), using the
+ * configuration id returned by get_currcfg().
+ */
+Datum
+plainto_tsquery_current(PG_FUNCTION_ARGS)
+{
+   Datum       cfgid;
+   Datum       query;
+
+   SET_FUNCOID();
+
+   cfgid = Int32GetDatum(get_currcfg());
+   query = DirectFunctionCall2(plainto_tsquery, cfgid, PG_GETARG_DATUM(0));
+
+   PG_RETURN_DATUM(query);
+}
+
index c973def7d4df67472e42f3a1b3db395edd473707..edc2d48fcfbe7f31c54d1b660238bcb1fef29a04 100644 (file)
@@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
    PG_RETURN_INT32(nnode);
 }
 
+/*
+ * join_tsqueries -- build a two-child operator node over queries a and b.
+ *
+ * The node's type is set to OPR but its val is left zeroed; the caller
+ * fills in the actual operator.  Note that b becomes child[0] and a
+ * becomes child[1].
+ */
+static QTNode* 
+join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) {
+   QTNode  *root = (QTNode*)palloc0( sizeof(QTNode) );
+   ITEM    *op;
+   QTNode  **kids;
+
+   root->flags |= QTN_NEEDFREE;
+
+   op = (ITEM*)palloc0( sizeof(ITEM) );
+   op->type = OPR;
+   root->valnode = op;
+
+   kids = (QTNode**)palloc0( sizeof(QTNode*)*2 );
+   kids[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) );
+   kids[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
+   root->child = kids;
+   root->nchild = 2;
+
+   return root;
+}
+
+PG_FUNCTION_INFO_V1(tsquery_and);
+Datum           tsquery_and(PG_FUNCTION_ARGS);
+
+/*
+ * tsquery_and(a, b) -- combine two tsqueries with '&'.
+ *
+ * Both arguments are fetched as detoasted copies.  If either one is empty
+ * (size == 0) the other is returned as is; otherwise the two are joined
+ * under a new '&' operator node and converted back with QTN2QT().
+ */
+Datum
+tsquery_and(PG_FUNCTION_ARGS) {
+   QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+   QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
+   QTNode  *res;
+   QUERYTYPE  *query;
+
+   if ( a->size == 0 ) {
+       /* a is argument 0, so that is the slot it must be compared against */
+       PG_FREE_IF_COPY(a,0);
+       PG_RETURN_POINTER(b);
+   } else if ( b->size == 0 ) {
+       PG_FREE_IF_COPY(b,1);
+       PG_RETURN_POINTER(a);
+   }   
+
+   res = join_tsqueries(a, b);
+
+   res->valnode->val = '&';
+
+   query = QTN2QT( res, PlainMemory );
+
+   /* the tree and the argument copies are no longer needed */
+   QTNFree(res);
+   PG_FREE_IF_COPY(a,0);
+   PG_FREE_IF_COPY(b,1);
+
+   PG_RETURN_POINTER(query);
+}
+
+PG_FUNCTION_INFO_V1(tsquery_or);
+Datum           tsquery_or(PG_FUNCTION_ARGS);
+
+/*
+ * tsquery_or(a, b) -- combine two tsqueries with '|'.
+ *
+ * Both arguments are fetched as detoasted copies.  If either one is empty
+ * (size == 0) the other is returned as is; otherwise the two are joined
+ * under a new '|' operator node and converted back with QTN2QT().
+ */
+Datum
+tsquery_or(PG_FUNCTION_ARGS) {
+   QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+   QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
+   QTNode  *res;
+   QUERYTYPE  *query;
+
+   if ( a->size == 0 ) {
+       /* a is argument 0, so that is the slot it must be compared against */
+       PG_FREE_IF_COPY(a,0);
+       PG_RETURN_POINTER(b);
+   } else if ( b->size == 0 ) {
+       PG_FREE_IF_COPY(b,1);
+       PG_RETURN_POINTER(a);
+   }   
+
+   res = join_tsqueries(a, b);
+
+   res->valnode->val = '|';
+
+   query = QTN2QT( res, PlainMemory );
+
+   /* the tree and the argument copies are no longer needed */
+   QTNFree(res);
+   PG_FREE_IF_COPY(a,0);
+   PG_FREE_IF_COPY(b,1);
+
+   PG_RETURN_POINTER(query);
+}
+
+PG_FUNCTION_INFO_V1(tsquery_not);
+Datum           tsquery_not(PG_FUNCTION_ARGS);
+
+/*
+ * tsquery_not(a) -- wrap a tsquery in a '!' operator node.
+ *
+ * An empty query (size == 0) is returned unchanged.  Otherwise a one-child
+ * operator node is built over the query and converted back with QTN2QT().
+ */
+Datum
+tsquery_not(PG_FUNCTION_ARGS) {
+   QUERYTYPE  *arg = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+   QTNode  *neg;
+   QUERYTYPE  *negated;
+
+   /* nothing to negate in an empty query */
+   if ( arg->size == 0 ) 
+       PG_RETURN_POINTER(arg);
+
+   neg = (QTNode*)palloc0( sizeof(QTNode) );
+   neg->flags |= QTN_NEEDFREE;
+
+   /* the operator item itself */
+   neg->valnode = (ITEM*)palloc0( sizeof(ITEM) );
+   neg->valnode->val = '!';
+   neg->valnode->type = OPR;
+
+   /* its single operand: the original query as a tree */
+   neg->nchild = 1;
+   neg->child = (QTNode**)palloc0( sizeof(QTNode*) );
+   neg->child[0] = QT2QTN( GETQUERY(arg), GETOPERAND(arg) );
+
+   negated = QTN2QT( neg, PlainMemory );
+
+   QTNFree(neg);
+   PG_FREE_IF_COPY(arg,0);
+
+   PG_RETURN_POINTER(negated);
+}
+
 static int
 CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
    if ( a->size != b->size ) {
index 0923ce7a19755022e181fcaea6d567152dfa3c0a..bd0baa3b41d4d88603a28194634119f75fab56ce 100644 (file)
@@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
 select to_tsquery('default', '(asd&and)|fghj');
 select to_tsquery('default', '(asd&!and)|fghj');
 select to_tsquery('default', '(the|and&(i&1))&fghj');
+
+select plainto_tsquery('default', 'the and z 1))& fghj');
+select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
+select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
+select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
+select plainto_tsquery('default', 'foo bar') && 'asd | fg';
+
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c
new file mode 100644 (file)
index 0000000..b84681f
--- /dev/null
@@ -0,0 +1,61 @@
+#include "ts_locale.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
+
+
+#if defined(TS_USE_WIDE) && defined(WIN32)
+
+/*
+ * wchar2char --- WIN32 stand-in for wcstombs().
+ *
+ * Converts the NUL-terminated wide string "from" into the multibyte buffer
+ * "to", whose capacity is "len" bytes, and returns the number of bytes
+ * produced, not counting a terminating NUL (the wcstombs() convention).
+ *
+ * With a UTF-8 database encoding the conversion is done by
+ * WideCharToMultiByte(); a failure there is reported with ereport(ERROR).
+ * For any other encoding the call is passed straight to wcstombs().
+ *
+ * NOTE(review): "to" is written through although the prototype declares it
+ * const; the qualifier is kept only so the signature stays unchanged.
+ */
+size_t
+wchar2char( const char *to, const wchar_t *from, size_t len ) {
+   if (GetDatabaseEncoding() == PG_UTF8) {
+       int r;
+
+       if (len==0)
+           return 0;
+
+       /*
+        * -1: let the API stop at the terminating NUL of "from".
+        * len: capacity of the output buffer in bytes.
+        */
+       r = WideCharToMultiByte(CP_UTF8, 0, from, -1, (char *) to, len,
+               NULL, NULL);
+
+       
+       if ( r==0 )
+           ereport(ERROR,
+               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                   errmsg("UTF-16 to UTF-8 translation failed: %lu",
+                       GetLastError())));
+
+       Assert((size_t) r <= len);
+
+       /* the API counts the terminating NUL it wrote; wcstombs() does not */
+       return r - 1;
+   }
+
+   return wcstombs((char *) to, from, len);
+}
+
+/*
+ * char2wchar --- WIN32 stand-in for mbstowcs().
+ *
+ * Converts "len" bytes of the multibyte string "from" into the wide buffer
+ * "to" and returns the number of wide characters produced.  "len" is used
+ * both as the input length in bytes and as the capacity of "to" in wide
+ * characters.
+ *
+ * With a UTF-8 database encoding the conversion is done by
+ * MultiByteToWideChar(); if that fails the input is first run through
+ * pg_verifymbstr() and then an error is raised.  For any other encoding the
+ * call is passed straight to mbstowcs().
+ *
+ * NOTE(review): "to" is written through although the prototype declares it
+ * const; the qualifier is kept only so the signature stays unchanged.
+ */
+size_t 
+char2wchar( const wchar_t *to, const char *from, size_t len ) {
+   if (GetDatabaseEncoding() == PG_UTF8) {
+       int r;
+
+       if (len==0)
+           return 0;
+
+       r = MultiByteToWideChar(CP_UTF8, 0, from, len,
+           (wchar_t *) to, len);
+
+       if (!r) {
+           pg_verifymbstr(from, len, false);
+           ereport(ERROR,
+               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+               errmsg("invalid multibyte character for locale"),
+               errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+       }
+
+       /* the output can never exceed the capacity we passed in */
+       Assert((size_t) r <= len);
+
+       return r;
+   }
+   
+   return mbstowcs((wchar_t *) to, from, len);
+}
+
+#endif
diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h
new file mode 100644 (file)
index 0000000..a7ce6f1
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef __TSLOCALE_H__
+#define __TSLOCALE_H__
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <limits.h>
+
+/*
+ * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
+ * declare them in <wchar.h>.
+ */
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+#ifdef HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
+
+#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
+#define TS_USE_WIDE
+
+#ifdef WIN32
+
+size_t wchar2char( const char *to, const wchar_t *from, size_t len );
+size_t char2wchar( const wchar_t *to, const char *from, size_t len );
+
+#else /* WIN32 */
+
+/* correct mbstowcs */
+#define char2wchar mbstowcs
+#define wchar2char wcstombs
+
+#endif /* WIN32 */
+#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */ 
+
+#endif  /* __TSLOCALE_H__ */
index 9bdf641e121d239c00ec6cd5eecd6faef3895436..4fdf974d0d1defe50f07fa97d905dfb3a558e119 100644 (file)
@@ -427,6 +427,21 @@ RETURNS tsquery
 AS 'MODULE_PATHNAME','to_tsquery_current'
 LANGUAGE 'c' with (isstrict,iscachable);
 
+-- plainto_tsquery: configuration given by oid
+CREATE FUNCTION plainto_tsquery(oid, text) RETURNS tsquery
+AS 'MODULE_PATHNAME'
+LANGUAGE 'c' with (isstrict,iscachable);
+
+-- plainto_tsquery: configuration given by name
+CREATE FUNCTION plainto_tsquery(text, text) RETURNS tsquery
+AS 'MODULE_PATHNAME','plainto_tsquery_name'
+LANGUAGE 'c' with (isstrict,iscachable);
+
+-- plainto_tsquery: current configuration
+CREATE FUNCTION plainto_tsquery(text) RETURNS tsquery
+AS 'MODULE_PATHNAME','plainto_tsquery_current'
+LANGUAGE 'c' with (isstrict,iscachable);
+
 --operations
 CREATE FUNCTION exectsq(tsvector, tsquery)
 RETURNS bool
@@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
         language 'C'
         with (isstrict,iscachable);
 
+-- support function for the tsquery && tsquery operator
+CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery) returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_and'
+        language 'C'
+        with (isstrict,iscachable);
+
+-- No RESTRICT/JOIN clauses: selectivity estimators are only meaningful
+-- for operators that return boolean, and && returns tsquery.
+CREATE OPERATOR && (
+        LEFTARG = tsquery,
+        RIGHTARG = tsquery,
+        PROCEDURE = tsquery_and,
+        COMMUTATOR = '&&'
+);
+
+-- support function for the tsquery || tsquery operator
+CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery) returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_or'
+        language 'C'
+        with (isstrict,iscachable);
+
+-- No RESTRICT/JOIN clauses: selectivity estimators are only meaningful
+-- for operators that return boolean, and || returns tsquery.
+CREATE OPERATOR || (
+        LEFTARG = tsquery,
+        RIGHTARG = tsquery,
+        PROCEDURE = tsquery_or,
+        COMMUTATOR = '||'
+);
+
+-- support function for the prefix !! tsquery operator
+CREATE OR REPLACE FUNCTION tsquery_not(tsquery) returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_not'
+        language 'C'
+        with (isstrict,iscachable);
+
+-- prefix operator: takes only a right-hand argument
+CREATE OPERATOR !! (
+        RIGHTARG = tsquery, PROCEDURE = tsquery_not
+);
+
 --------------rewrite subsystem
 
 CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
index 0070970e2165e054eea0d1afe88c528c02ee2e6d..c4eceba60bb22b2e515908786b8b2c43a7608fe3 100644 (file)
@@ -1,8 +1,8 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
 
 SUBOBJS =  parser.o deflex.o
 
-EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
+EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
 
 PG_CPPFLAGS = -I$(srcdir)/..
 
@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
 
 all: SUBSYS.o
 
-parser.c: parser.l
-ifdef FLEX
-   $(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
-else
-   @$(missing) flex $< $@
-endif
-
 SUBSYS.o: $(SUBOBJS)
    $(LD) $(LDREL) $(LDOUT) $@ $^
 
index bbf3271b666f4682143cc69f200e8759b084f15a..8f93d277a1e21a0942a2068d8ffaeff3362f4c85 100644 (file)
@@ -15,7 +15,7 @@ const char *lex_descr[] = {
    "Latin part of hyphenated word",
    "Space symbols",
    "HTML Tag",
-   "HTTP head",
+   "Protocol head",
    "Hyphenated word",
    "Latin hyphenated word",
    "Non-latin hyphenated word",
@@ -42,7 +42,7 @@ const char *tok_alias[] = {
    "lpart_hword",
    "blank",
    "tag",
-   "http",
+   "protocol",
    "hword",
    "lhword",
    "nlhword",
index 651d1f9e77301352fac86b64faa7c2eff87c9141..893f8430515ea4990f41e466e3c91cfea55e6a88 100644 (file)
@@ -17,7 +17,7 @@
 #define LATPARTHYPHENWORD  11
 #define SPACE      12
 #define TAG            13
-#define HTTP       14
+#define PROTOCOL       14
 #define HYPHENWORD 15
 #define LATHYPHENWORD  16
 #define CYRHYPHENWORD  17
diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c
new file mode 100644 (file)
index 0000000..e414a86
--- /dev/null
@@ -0,0 +1,1028 @@
+#include "postgres.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
+
+#include "deflex.h"
+#include "parser.h"
+#include "ts_locale.h"
+
+
+/*
+ * Allocate a parser position record.  When prev is given the new record
+ * starts as a byte copy of it, otherwise it starts zeroed; in both cases
+ * it is linked back to prev and its pushedAtAction is reset to NULL.
+ */
+static TParserPosition*
+newTParserPosition(TParserPosition *prev) {
+   TParserPosition *pos;
+
+   pos = (TParserPosition*)palloc(sizeof(TParserPosition));
+
+   if ( prev == NULL )
+       memset(pos, 0, sizeof(TParserPosition));
+   else
+       memcpy(pos, prev, sizeof(TParserPosition));
+
+   pos->pushedAtAction = NULL;
+   pos->prev = prev;
+
+   return pos;
+}
+
+/*
+ * Create a parser over the len bytes starting at str.
+ *
+ * The TParser is allocated zeroed; str is stored by pointer, not copied.
+ * When built with TS_USE_WIDE, and the database encoding is multibyte and
+ * the ctype locale is not C, the input is also converted to a wchar_t
+ * buffer (prs->wstr) and usewide is set; otherwise usewide is false.
+ * The initial position record is created with state TPS_Base.
+ */
+TParser*
+TParserInit( char *str, int len ) {
+   TParser *prs = (TParser*)palloc0( sizeof(TParser) );
+
+   prs->charmaxlen = pg_database_encoding_max_length();
+   prs->str = str; 
+   prs->lenstr = len;
+
+#ifdef TS_USE_WIDE
+   /*
+    * Use wide char code only when max encoding length > 1 and ctype != C.
+    * Some operating systems fail with multi-byte encodings and a C locale.
+    * Also, for a C locale there is no need to process as multibyte.
+    * From backend/utils/adt/oracle_compat.c Teodor
+    */
+
+   if ( prs->charmaxlen > 1 && !lc_ctype_is_c() ) {
+       prs->usewide=true;
+       /*
+        * NOTE(review): the buffer has room for exactly lenstr wide characters
+        * and the result of char2wchar is stored unchecked -- confirm that
+        * char2wchar never writes a terminator past lenstr entries and cannot
+        * return an error value here.
+        */
+       prs->wstr = (wchar_t*)palloc( sizeof(wchar_t) * prs->lenstr );
+       prs->lenwstr = char2wchar( prs->wstr, prs->str, prs->lenstr ); 
+   } else
+#endif
+       /* body of the "else" above, or an unconditional statement without TS_USE_WIDE */
+       prs->usewide=false;
+
+   prs->state = newTParserPosition(NULL);
+   prs->state->state = TPS_Base;
+
+   return prs;
+}
+
+/*
+ * Free a parser: every position record reachable through ->prev, the
+ * wide-character buffer if one was allocated, and the TParser itself.
+ */
+void
+TParserClose( TParser* prs ) {
+   TParserPosition *cur = prs->state;
+
+   while( cur != NULL ) {
+       TParserPosition *older = cur->prev;
+
+       pfree( cur );
+       cur = older;
+   }
+   prs->state = NULL;
+
+   if ( prs->wstr != NULL )
+       pfree( prs->wstr );
+   pfree( prs );
+}
+
+/*
+ * Support functions equivalent to the is* macros, but
+ * working with any possible encodings and locales
+ */
+
+#ifdef TS_USE_WIDE 
+
+/*
+ * p_iswhat(type) defines two predicates on the parser's current character:
+ * p_is<type>() and its negation p_isnot<type>().  When prs->usewide is set
+ * the wide character at poschar is classified with isw<type>(); otherwise
+ * the byte at posbyte is classified with is<type>().
+ */
+#define p_iswhat(type)                                         \
+static int                                                     \
+p_is##type(TParser *prs) {                                     \
+   Assert( prs->state );                                       \
+   return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
+       is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );       \
+}                                                              \
+                                                               \
+static int                                                     \
+p_isnot##type(TParser *prs) {                                  \
+   return !p_is##type(prs);                                    \
+}
+
+
+
+/*
+ * p_iseq should be used only for ascii symbols: it matches only when the
+ * current character is one byte long and that byte equals c.
+ */
+
+static int
+p_iseq(TParser *prs, char c) {
+   Assert( prs->state );
+   return ( ( prs->state->charlen==1 && *( prs->str + prs->state->posbyte ) == c ) ) ? 1 : 0;
+}
+
+#else /* TS_USE_WIDE */
+
+/*
+ * Single-byte build: p_is<type>() classifies the byte at posbyte with
+ * is<type>(), and p_isnot<type>() is its negation.
+ */
+#define p_iswhat(type)                                         \
+static int                                                     \
+p_is##type(TParser *prs) {                                     \
+   Assert( prs->state );                                       \
+   return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );  \
+}                                                              \
+                                                               \
+static int                                                     \
+p_isnot##type(TParser *prs) {                                  \
+   return !p_is##type(prs);                                    \
+}
+
+
+/* Returns 1 when the byte at the current position equals c, else 0. */
+static int
+p_iseq(TParser *prs, char c) {
+   Assert( prs->state );
+   return ( *( prs->str + prs->state->posbyte ) == c ) ? 1 : 0;
+}
+
+#endif /* TS_USE_WIDE */
+
+/* Instantiate p_is<class>() and p_isnot<class>() for each character class. */
+p_iswhat(alnum)
+p_iswhat(alpha)
+p_iswhat(digit)
+p_iswhat(lower)
+p_iswhat(print)
+p_iswhat(punct)
+p_iswhat(space)
+p_iswhat(upper)
+p_iswhat(xdigit)
+
+/*
+ * Returns 1 when the byte position has reached the end of the input or the
+ * current character length is zero, else 0.
+ */
+static int
+p_isEOF(TParser *prs) {
+   Assert( prs->state );
+
+   if ( prs->state->posbyte == prs->lenstr )
+       return 1;
+
+   return ( prs->state->charlen == 0 ) ? 1 : 0;
+}
+
+/* Compare the current character with the parser's own character prs->c. */
+static int
+p_iseqC(TParser *prs) {
+   char wanted = prs->c;
+
+   return p_iseq(prs, wanted);
+}
+
+/* Negation of the comparison with prs->c: 1 when p_iseq reports no match. */
+static int
+p_isneC(TParser *prs) {
+   return ( p_iseq(prs, prs->c) == 0 ) ? 1 : 0;
+}
+
+/*
+ * Returns 1 when the current character is a single byte for which
+ * isascii() is true, else 0.
+ */
+static int
+p_isascii(TParser *prs) {
+   unsigned char ch;
+
+   if ( prs->state->charlen != 1 )
+       return 0;
+
+   ch = (unsigned char) *( prs->str + prs->state->posbyte );
+   return isascii( ch ) ? 1 : 0;
+}
+
+/* Returns 1 for an alphabetic character that is also ASCII, else 0. */
+static int
+p_islatin(TParser *prs) {
+   if ( !p_isalpha(prs) )
+       return 0;
+
+   return p_isascii(prs) ? 1 : 0;
+}
+
+/* Returns 1 for an alphabetic character that is not ASCII, else 0. */
+static int
+p_isnonlatin(TParser *prs) {
+   if ( !p_isalpha(prs) )
+       return 0;
+
+   return p_isascii(prs) ? 0 : 1;
+}
+
+/*
+ * References every generated p_is<class>()/p_isnot<class>() predicate plus
+ * p_isEOF, p_iseqC and p_isneC.
+ *
+ * NOTE(review): judging by the name, this exists only so the compiler sees
+ * each static predicate used and does not warn about unused functions.  It
+ * must never actually be called: the predicates dereference their argument,
+ * and NULL is passed here.
+ */
+void _make_compiler_happy(void);
+void
+_make_compiler_happy(void) {
+   p_isalnum(NULL);    p_isnotalnum(NULL);
+   p_isalpha(NULL);    p_isnotalpha(NULL);
+   p_isdigit(NULL);    p_isnotdigit(NULL);
+   p_islower(NULL);    p_isnotlower(NULL);
+   p_isprint(NULL);    p_isnotprint(NULL);
+   p_ispunct(NULL);    p_isnotpunct(NULL);
+   p_isspace(NULL);    p_isnotspace(NULL);
+   p_isupper(NULL);    p_isnotupper(NULL);
+   p_isxdigit(NULL);   p_isnotxdigit(NULL);
+   p_isEOF(NULL);  
+   p_iseqC(NULL);  p_isneC(NULL);
+}
+
+
+/*
+ * Switch prs->ignore on or off depending on the lexeme collected so far:
+ * "<script" and "<style" turn it on, "</script" and "</style" turn it off.
+ * The comparison is case-insensitive and only made when the lexeme's
+ * character length equals the length of the tag text.
+ */
+static void
+SpecialTags(TParser *prs) {
+   if ( prs->state->lencharlexeme == 8 ) {
+       /* </script */
+       if ( pg_strncasecmp( prs->lexeme, "</script", 8 ) == 0 )
+           prs->ignore = false;
+   } else if ( prs->state->lencharlexeme == 7 ) {
+       /* </style is checked first, then <script */
+       if ( pg_strncasecmp( prs->lexeme, "</style", 7 ) == 0 )
+           prs->ignore = false;
+       else if ( pg_strncasecmp( prs->lexeme, "<script", 7 ) == 0 )
+           prs->ignore = true;
+   } else if ( prs->state->lencharlexeme == 6 ) {
+       /* <style */
+       if ( pg_strncasecmp( prs->lexeme, "<style", 6 ) == 0 )
+           prs->ignore = true;
+   }
+}
+
+/*
+ * Move the current position back by the length of the current lexeme (in
+ * both bytes and characters) and set the wanthost flag.
+ */
+static void
+SpecialFURL(TParser *prs) {
+   TParserPosition *pos = prs->state;
+
+   pos->poschar -= pos->lencharlexeme;
+   pos->posbyte -= pos->lenbytelexeme;
+   prs->wanthost = true;
+}
+
+/*
+ * Move the current position back by the length of the current lexeme, in
+ * both bytes and characters.
+ */
+static void
+SpecialHyphen(TParser *prs) {
+   TParserPosition *pos = prs->state;
+
+   pos->poschar -= pos->lencharlexeme;
+   pos->posbyte -= pos->lenbytelexeme;
+}
+
+/*
+ * One-shot test of the wanthost flag: returns 1 and clears the flag when it
+ * is set, returns 0 otherwise.
+ */
+static int
+p_isstophost(TParser *prs) {
+   if ( !prs->wanthost )
+       return 0;
+
+   prs->wanthost = false;
+   return 1;
+}
+
+/* Returns 1 while prs->ignore is set, else 0. */
+static int
+p_isignore(TParser *prs) {
+   if ( prs->ignore )
+       return 1;
+
+   return 0;
+}
+
+/*
+ * Run a temporary parser over the rest of the input, starting at the
+ * current byte position.  If its first token is of type HOST, advance this
+ * parser's position and lexeme lengths by that token's byte and character
+ * lengths, take over the temporary parser's charlen, and return 1.
+ * Otherwise leave the position untouched and return 0.
+ */
+static int
+p_ishost(TParser *prs) {
+   int matched = 0;
+   TParser *sub;
+
+   sub = TParserInit( prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte );
+
+   if ( TParserGet(sub) && sub->type == HOST ) {
+       matched = 1;
+       prs->state->lenbytelexeme += sub->lenbytelexeme;
+       prs->state->lencharlexeme += sub->lencharlexeme;
+       prs->state->posbyte += sub->lenbytelexeme;
+       prs->state->poschar += sub->lencharlexeme;
+       prs->state->charlen = sub->state->charlen;
+   }
+
+   TParserClose(sub);
+
+   return matched;
+}
+
+/*
+ * Run a temporary parser over the rest of the input, started in state
+ * TPS_InFileFirst on a freshly pushed position.  If its first token is of
+ * type URI or FILEPATH, advance this parser's position and lexeme lengths
+ * by that token's byte and character lengths, take over the temporary
+ * parser's charlen, and return 1.  Otherwise return 0.
+ */
+static int
+p_isURI(TParser *prs) {
+   int matched = 0;
+   TParser *sub;
+
+   sub = TParserInit( prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte );
+   sub->state = newTParserPosition( sub->state );
+   sub->state->state = TPS_InFileFirst;
+
+   if ( TParserGet(sub) && (sub->type == URI || sub->type == FILEPATH) ) {
+       matched = 1;
+       prs->state->lenbytelexeme += sub->lenbytelexeme;
+       prs->state->lencharlexeme += sub->lencharlexeme;
+       prs->state->posbyte += sub->lenbytelexeme;
+       prs->state->poschar += sub->lencharlexeme;
+       prs->state->charlen = sub->state->charlen;
+   }
+
+   TParserClose(sub);
+
+   return matched;
+}
+
+/*
+ * Table of state/action of parser
+ */
+
+/*
+ * Action flags used in the tables that follow.  A_NEXT is zero, i.e. the
+ * absence of any flag; the others are distinct bits that the tables combine
+ * with '|' (for example A_BINGO|A_CLRALL).
+ */
+#define A_NEXT     0x0000
+#define A_BINGO        0x0001
+#define A_POP      0x0002
+#define A_PUSH     0x0004
+#define A_RERUN        0x0008
+#define A_CLEAR        0x0010
+#define A_MERGE        0x0020
+#define A_CLRALL   0x0040
+
+/*
+ * Tables for the base state, plain words, integers and blank space.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.  Every table ends with a row whose test function is NULL.
+ */
+
+/* No A_BINGO row: this table only dispatches on the character to other states. */
+static TParserStateActionItem actionTPS_Base[] = {
+   {p_isEOF,   0,  A_NEXT,         TPS_Null,       0,  NULL},
+   {p_iseqC,   '<',    A_PUSH,         TPS_InTagFirst,     0,  NULL},
+   {p_isignore,    0,  A_NEXT,         TPS_InSpace,        0,  NULL},
+   {p_islatin,     0,  A_NEXT,         TPS_InLatWord,      0,  NULL},
+   {p_isnonlatin,  0,  A_NEXT,         TPS_InCyrWord,      0,  NULL},
+   {p_isdigit,     0,  A_NEXT,         TPS_InUnsignedInt,  0,  NULL},
+   {p_iseqC,   '-',    A_PUSH,         TPS_InSignedIntFirst,   0,  NULL},
+   {p_iseqC,   '+',    A_PUSH,         TPS_InSignedIntFirst,   0,  NULL},
+   {p_iseqC,   '&',    A_PUSH,         TPS_InHTMLEntityFirst,  0,  NULL},
+   {p_iseqC,   '/',    A_PUSH,         TPS_InFileFirst,    0,  NULL},
+   {NULL,      0,  A_NEXT,         TPS_InSpace,        0,  NULL} 
+}; 
+
+
+/* A_BINGO rows carry lexeme type UWORD. */
+static TParserStateActionItem actionTPS_InUWord[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       UWORD,      NULL},
+   {p_isalnum,     0,  A_NEXT,     TPS_InUWord,        0,      NULL},
+   {p_iseqC,   '@',    A_PUSH,     TPS_InEmail,        0,      NULL},
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFileFirst,    0,      NULL},
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenUWordFirst, 0,      NULL},
+   {NULL,      0,  A_BINGO,    TPS_Base,       UWORD,      NULL}
+};
+
+/*
+ * A_BINGO rows carry lexeme type LATWORD.
+ * NOTE(review): '.' and '-' each appear in two A_PUSH rows with different
+ * target states -- presumably the second is tried after the first pushed
+ * state pops; confirm in TParserGet.
+ */
+static TParserStateActionItem actionTPS_InLatWord[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       LATWORD,    NULL},
+   {p_islatin,     0,  A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,   '.',    A_PUSH,     TPS_InHostFirstDomen,   0,      NULL},
+   {p_iseqC,   '.',    A_PUSH,     TPS_InFileFirst,    0,      NULL},
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHostFirstAN,  0,      NULL},
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenLatWordFirst,0,     NULL},
+   {p_iseqC,   '@',    A_PUSH,     TPS_InEmail,        0,      NULL},
+   {p_iseqC,   ':',    A_PUSH,     TPS_InProtocolFirst,    0,      NULL},
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFileFirst,    0,      NULL},
+   {p_isdigit,     0,  A_PUSH,     TPS_InHost,     0,      NULL},
+   {p_isalnum,     0,  A_NEXT,     TPS_InUWord,        0,      NULL},
+   {NULL,      0,  A_BINGO,    TPS_Base,       LATWORD,    NULL}
+};
+
+/* A_BINGO rows carry lexeme type CYRWORD. */
+static TParserStateActionItem actionTPS_InCyrWord[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       CYRWORD,    NULL},
+   {p_isnonlatin,  0,  A_NEXT,     TPS_Null,       0,      NULL},
+   {p_isalnum,     0,  A_NEXT,     TPS_InUWord,        0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenCyrWordFirst,0,     NULL},
+   {NULL,      0,  A_BINGO,    TPS_Base,       CYRWORD,    NULL}
+};
+/* A_BINGO rows carry lexeme type UNSIGNEDINT. */
+static TParserStateActionItem actionTPS_InUnsignedInt[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       UNSIGNEDINT,    NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InHostFirstDomen,   0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InUDecimalFirst,    0,      NULL}, 
+   {p_iseqC,   'e',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {p_iseqC,   'E',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {p_islatin,     0,  A_PUSH,     TPS_InHost,         0,      NULL}, 
+   {p_isalpha,     0,  A_NEXT,     TPS_InUWord,        0,      NULL}, 
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFileFirst,    0,      NULL},
+   {NULL,      0,  A_BINGO,    TPS_Base,       UNSIGNEDINT,    NULL}
+};
+/* A digit leads to TPS_InSignedInt with A_CLEAR; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InSignedIntFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT|A_CLEAR, TPS_InSignedInt,    0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+/* A_BINGO rows carry lexeme type SIGNEDINT. */
+static TParserStateActionItem actionTPS_InSignedInt[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       SIGNEDINT,  NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InDecimalFirst,     0,      NULL}, 
+   {p_iseqC,   'e',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {p_iseqC,   'E',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       SIGNEDINT,  NULL}
+};
+/*
+ * A_BINGO rows carry lexeme type SPACE.  The characters that the base table
+ * treats specially ('<', '-', '+', '&', '/') end the blank run here.
+ */
+static TParserStateActionItem actionTPS_InSpace[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_iseqC,   '<',    A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_isignore,    0,  A_NEXT,     TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '-',    A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_iseqC,   '+',    A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_iseqC,   '&',    A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_iseqC,   '/',    A_BINGO,    TPS_Base,       SPACE,      NULL}, 
+   {p_isnotalnum,  0,  A_NEXT,     TPS_InSpace,        0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       SPACE,      NULL} 
+};
+
+/*
+ * Tables for decimals, version numbers and the exponent part of numbers.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* A digit leads to TPS_InUDecimal with A_CLEAR; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InUDecimalFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InUDecimal,     0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type DECIMAL; a further '.' pushes TPS_InVersionFirst. */
+static TParserStateActionItem actionTPS_InUDecimal[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       DECIMAL,    NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InUDecimal,     0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InVersionFirst, 0,      NULL}, 
+   {p_iseqC,   'e',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {p_iseqC,   'E',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       DECIMAL,    NULL}
+};
+
+/* A digit leads to TPS_InDecimal with A_CLEAR; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InDecimalFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InDecimal,      0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type DECIMAL; unlike InUDecimal there is no '.' row. */
+static TParserStateActionItem actionTPS_InDecimal[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       DECIMAL,    NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InDecimal,      0,      NULL}, 
+   {p_iseqC,   'e',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {p_iseqC,   'E',    A_PUSH,     TPS_InMantissaFirst,    0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       DECIMAL,    NULL}
+};
+
+/* A digit leads to TPS_InVersion with A_CLEAR; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InVersionFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InVersion,      0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type VERSIONNUMBER. */
+static TParserStateActionItem actionTPS_InVersion[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       VERSIONNUMBER,  NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InVersion,      0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InVersionFirst, 0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       VERSIONNUMBER,  NULL}
+};
+
+/* A digit leads to TPS_InMantissa; '+' or '-' leads to TPS_InMantissaSign. */
+static TParserStateActionItem actionTPS_InMantissaFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InMantissa,     0,      NULL}, 
+   {p_iseqC,   '+',    A_NEXT,     TPS_InMantissaSign, 0,      NULL}, 
+   {p_iseqC,   '-',    A_NEXT,     TPS_InMantissaSign, 0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* After the sign only a digit is accepted; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InMantissaSign[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InMantissa,     0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type SCIENTIFIC. */
+static TParserStateActionItem actionTPS_InMantissa[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       SCIENTIFIC, NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InMantissa,     0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       SCIENTIFIC, NULL}
+};
+
+/*
+ * Tables for HTML entities: '&' followed by latin letters, or by '#' and
+ * digits, and closed by ';'.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* '#' selects the numeric form, a latin letter the named form. */
+static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,           0,      NULL},
+   {p_iseqC,       '#',    A_NEXT,         TPS_InHTMLEntityNumFirst,0,     NULL},
+   {p_islatin,     0,      A_NEXT,         TPS_InHTMLEntity,       0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* Latin letters continue; ';' leads to TPS_InHTMLEntityEnd; anything else is A_POP. */
+static TParserStateActionItem actionTPS_InHTMLEntity[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHTMLEntity,   0,      NULL}, 
+   {p_iseqC,   ';',    A_NEXT,     TPS_InHTMLEntityEnd,    0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* At least one digit is required after '#'. */
+static TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_isdigit, 0,  A_NEXT,     TPS_InHTMLEntityNum,    0,      NULL}, 
+   {NULL,          0,      A_POP,      TPS_Null,               0,      NULL}
+};
+
+/* Digits continue; ';' leads to TPS_InHTMLEntityEnd; anything else is A_POP. */
+static TParserStateActionItem actionTPS_InHTMLEntityNum[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InHTMLEntityNum,    0,      NULL}, 
+   {p_iseqC,   ';',    A_NEXT,     TPS_InHTMLEntityEnd,    0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* Single unconditional row carrying lexeme type HTMLENTITY. */
+static TParserStateActionItem actionTPS_InHTMLEntityEnd[] = {
+   {NULL,      0,  A_BINGO|A_CLEAR,TPS_Base,       HTMLENTITY, NULL}
+};
+
+/*
+ * Tables for markup tags and "<!-- ... -->" comments; both end in a row
+ * carrying lexeme type TAG.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* After '<': '/' starts a closing tag, '!' a comment, a latin letter a tag. */
+static TParserStateActionItem actionTPS_InTagFirst[] = {
+   {p_isEOF,       0,      A_POP,          TPS_Null,       0,      NULL},
+   {p_iseqC,       '/',    A_PUSH,         TPS_InTagCloseFirst,    0,      NULL},
+   {p_iseqC,   '!',    A_PUSH,         TPS_InCommentFirst, 0,      NULL},
+   {p_islatin,     0,      A_PUSH,         TPS_InTag,      0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A latin letter is required after the '/' of a closing tag. */
+static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_islatin,     0,      A_NEXT,     TPS_InTag,      0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/*
+ * Tag body.  The '>' row and the whitespace row both name SpecialTags as
+ * their special handler; quotes switch to the two quoted-value tables.
+ */
+static TParserStateActionItem actionTPS_InTag[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '>',    A_NEXT,     TPS_InTagEnd,       0,      SpecialTags}, 
+   {p_iseqC,   '\'',   A_NEXT,     TPS_InTagEscapeK,   0,      NULL},
+   {p_iseqC,   '"',    A_NEXT,     TPS_InTagEscapeKK,  0,      NULL},
+   {p_islatin,     0,  A_NEXT,     TPS_Null,       0,      NULL},
+   {p_isdigit,     0,  A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,   '=',    A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,   '-',    A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,   '#',    A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,   '%',    A_NEXT,     TPS_Null,       0,      NULL},
+   {p_isspace,     0,  A_NEXT,     TPS_Null,       0,      SpecialTags},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* Inside a single-quoted value: the closing quote returns to TPS_InTag. */
+static TParserStateActionItem actionTPS_InTagEscapeK[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '\\',   A_PUSH,     TPS_InTagBackSleshed,   0,      NULL},
+   {p_iseqC,   '\'',   A_NEXT,     TPS_InTag,      0,      NULL},
+   {NULL,      0,  A_NEXT,     TPS_InTagEscapeK,   0,      NULL}
+};
+
+/* Inside a double-quoted value: the closing quote returns to TPS_InTag. */
+static TParserStateActionItem actionTPS_InTagEscapeKK[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '\\',   A_PUSH,     TPS_InTagBackSleshed,   0,      NULL},
+   {p_iseqC,   '"',    A_NEXT,     TPS_InTag,      0,      NULL},
+   {NULL,      0,  A_NEXT,     TPS_InTagEscapeKK,  0,      NULL}
+};
+
+/* After a backslash inside a quoted value: any character is taken with A_MERGE. */
+static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {NULL,      0,  A_MERGE,    TPS_Null,       0,      NULL}
+};
+
+/* Single unconditional row carrying lexeme type TAG. */
+static TParserStateActionItem actionTPS_InTagEnd[] = {
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_Base,      TAG,        NULL}
+};
+
+/* After "<!": the first '-' is required. */
+static TParserStateActionItem actionTPS_InCommentFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_iseqC,       '-',    A_NEXT,         TPS_InCommentLast,  0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* After "<!-": the second '-' is required. */
+static TParserStateActionItem actionTPS_InCommentLast[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_iseqC,       '-',    A_NEXT,         TPS_InComment,      0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* Comment body: any character continues; '-' may begin the closing sequence. */
+static TParserStateActionItem actionTPS_InComment[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_iseqC,       '-',    A_NEXT,         TPS_InCloseCommentFirst,0,      NULL},
+   {NULL,      0,  A_NEXT,     TPS_Null,       0,      NULL}
+};
+
+/* One '-' seen: a second '-' moves on, anything else returns to the body. */
+static TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_iseqC,       '-',    A_NEXT,         TPS_InCloseCommentLast, 0,      NULL},
+   {NULL,      0,  A_NEXT,     TPS_InComment,      0,      NULL}
+};
+
+/* "--" seen: '>' ends the comment, further '-' stay here, anything else returns to the body. */
+static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_iseqC,       '-',    A_NEXT,     TPS_Null,       0,      NULL},
+   {p_iseqC,       '>',    A_NEXT,     TPS_InCommentEnd,   0,      NULL},
+   {NULL,      0,  A_NEXT,     TPS_InComment,      0,      NULL}
+};
+
+/* Single unconditional row carrying lexeme type TAG. */
+static TParserStateActionItem actionTPS_InCommentEnd[] = {
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_Base,      TAG,        NULL}
+};
+
+/*
+ * Other tables in this file enter TPS_InHostFirstDomen from their '.' rows.
+ * Here a latin letter leads to TPS_InHostDomenSecond and a digit to
+ * TPS_InHost; every other row is A_POP.
+ *
+ * The two disabled rows are kept as C block comments: "//" comments are not
+ * valid in C89.
+ */
+static TParserStateActionItem actionTPS_InHostFirstDomen[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHostDomenSecond,  0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InHost,     0,      NULL}, 
+   /* disabled: {p_iseqC, '-',    A_POP,      TPS_InHostFirstAN,  0,      NULL}, */
+   /* disabled: {p_iseqC, '.',    A_POP,      TPS_InHostFirstDomen,   0,      NULL}, */
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/*
+ * Tables for host names, ports and e-mail addresses.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* No A_BINGO row: a second latin letter leads to TPS_InHostDomen. */
+static TParserStateActionItem actionTPS_InHostDomenSecond[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHostDomen,    0,      NULL}, 
+   {p_isdigit, 0,  A_PUSH,     TPS_InHost,     0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHostFirstAN,  0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InHostFirstDomen,   0,      NULL}, 
+   {p_iseqC,   '@',    A_PUSH,     TPS_InEmail,        0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/*
+ * A_BINGO rows carry lexeme type HOST.  The p_isstophost row goes on to
+ * TPS_InURIStart instead of TPS_Base; '/' pushes TPS_InFURL.
+ * NOTE(review): p_isdigit appears twice (A_PUSH, then A_POP) -- presumably
+ * the A_POP row applies once the pushed TPS_InHost attempt has failed;
+ * confirm in TParserGet.
+ */
+static TParserStateActionItem actionTPS_InHostDomen[] = {
+   {p_isEOF,   0,  A_BINGO|A_CLRALL,TPS_Base,      HOST,       NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHostDomen,    0,      NULL}, 
+   {p_isdigit, 0,  A_PUSH,     TPS_InHost,     0,      NULL}, 
+   {p_iseqC,   ':',    A_PUSH,     TPS_InPortFirst,    0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHostFirstAN,  0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InHostFirstDomen,   0,      NULL}, 
+   {p_iseqC,   '@',    A_PUSH,     TPS_InEmail,        0,      NULL}, 
+   {p_isdigit, 0,  A_POP,      TPS_Null,       0,      NULL},
+   {p_isstophost,  0,  A_BINGO|A_CLRALL,TPS_InURIStart,    HOST,       NULL},
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFURL,     0,      NULL},
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_Base,      HOST,       NULL}
+};
+
+/* After ':': at least one digit is required. */
+static TParserStateActionItem actionTPS_InPortFirst[] = {
+   {p_isEOF,       0,      A_POP,      TPS_Null,               0,      NULL},
+   {p_isdigit,     0,      A_NEXT,     TPS_InPort,     0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type HOST, as in actionTPS_InHostDomen. */
+static TParserStateActionItem actionTPS_InPort[] = {
+   {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_Base,              HOST,       NULL},
+   {p_isdigit,     0,      A_NEXT,     TPS_InPort,     0,      NULL},
+   {p_isstophost,  0,  A_BINGO|A_CLRALL,TPS_InURIStart,    HOST,       NULL},
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFURL,     0,      NULL},
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_Base,      HOST,       NULL}
+};
+
+/* A digit or latin letter leads to TPS_InHost; every other row is A_POP. */
+static TParserStateActionItem actionTPS_InHostFirstAN[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InHost,     0,      NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHost,     0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* No A_BINGO row: digits and latin letters continue; '@', '.', '-' push. */
+static TParserStateActionItem actionTPS_InHost[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InHost,     0,      NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InHost,     0,      NULL}, 
+   {p_iseqC,   '@',    A_PUSH,     TPS_InEmail,        0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InHostFirstDomen,   0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHostFirstAN,  0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* The p_ishost row carries lexeme type EMAIL; otherwise A_POP. */
+static TParserStateActionItem actionTPS_InEmail[] = {
+   {p_ishost,  0,  A_BINGO|A_CLRALL, TPS_Base,         EMAIL,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/*
+ * Tables for file paths, URIs and full URLs.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* First path character: latin, digit, '.' or '_' lead to TPS_InFile; '?' pushes TPS_InURIFirst. */
+static TParserStateActionItem actionTPS_InFileFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin, 0,  A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '.',    A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '_',    A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '?',    A_PUSH,     TPS_InURIFirst,     0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type FILEPATH. */
+static TParserStateActionItem actionTPS_InFile[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       FILEPATH,   NULL}, 
+   {p_islatin, 0,  A_NEXT,     TPS_InFile,     0,      NULL}, 
+   {p_isdigit, 0,  A_NEXT,     TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '.',    A_PUSH,     TPS_InFileNext,     0,      NULL}, 
+   {p_iseqC,   '_',    A_NEXT,     TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '-',    A_NEXT,     TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '/',    A_PUSH,     TPS_InFileFirst,    0,      NULL}, 
+   {p_iseqC,   '?',    A_PUSH,     TPS_InURIFirst,     0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       FILEPATH,   NULL}
+};
+
+/* After a '.' inside a path: latin, digit or '_' return to TPS_InFile. */
+static TParserStateActionItem actionTPS_InFileNext[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin, 0,  A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_isdigit, 0,  A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {p_iseqC,   '_',    A_CLEAR,    TPS_InFile,     0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/* After '?': quotes are rejected; any other non-space leads to TPS_InURI. */
+static TParserStateActionItem actionTPS_InURIFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '"',    A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '\'',   A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isnotspace,  0,  A_CLEAR,    TPS_InURI,      0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL},
+};
+/* Single unconditional row leading to TPS_InURI. */
+static TParserStateActionItem actionTPS_InURIStart[] = {
+   {NULL,      0,  A_NEXT,     TPS_InURI,      0,      NULL}
+};
+
+/* A_BINGO rows carry lexeme type URI; a quote or whitespace ends it. */
+static TParserStateActionItem actionTPS_InURI[] = {
+   {p_isEOF,   0,  A_BINGO,    TPS_Base,       URI,        NULL}, 
+   {p_iseqC,   '"',    A_BINGO,    TPS_Base,       URI,        NULL}, 
+   {p_iseqC,   '\'',   A_BINGO,    TPS_Base,       URI,        NULL}, 
+   {p_isnotspace,  0,  A_NEXT,     TPS_InURI,      0,      NULL}, 
+   {NULL,      0,  A_BINGO,    TPS_Base,       URI,        NULL}
+};
+
+/* The p_isURI row carries lexeme type FURL and names SpecialFURL as its handler. */
+static TParserStateActionItem actionTPS_InFURL[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isURI,   0,  A_BINGO|A_CLRALL,TPS_Base,      FURL,       SpecialFURL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+       
+/*
+ * Tables for a protocol head: actionTPS_InLatWord pushes
+ * TPS_InProtocolFirst on ':', and two '/' characters must follow.
+ *
+ * NOTE(review): each row reads as { test function, character argument used
+ * by p_iseqC, A_* action flags, next TPS_* state, lexeme type, special
+ * handler } -- confirm the field order against TParserStateActionItem in
+ * parser.h.
+ */
+
+/* First '/' after the ':'. */
+static TParserStateActionItem actionTPS_InProtocolFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '/',    A_NEXT,     TPS_InProtocolSecond,   0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+       
+/* Second '/'. */
+static TParserStateActionItem actionTPS_InProtocolSecond[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_iseqC,   '/',    A_NEXT,     TPS_InProtocolEnd,  0,      NULL},
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+       
+/* Single unconditional row carrying lexeme type PROTOCOL. */
+static TParserStateActionItem actionTPS_InProtocolEnd[] = {
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_Base,      PROTOCOL,   NULL}
+};
+       
+static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_islatin,     0,  A_NEXT,     TPS_InHyphenLatWord,    0,      NULL}, 
+   {p_isnonlatin,  0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenValue,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
+   {p_isEOF,   0,  A_BINGO|A_CLRALL,TPS_InParseHyphen,     LATHYPHENWORD,  SpecialHyphen}, 
+   {p_islatin,     0,  A_NEXT,     TPS_InHyphenLatWord,    0,      NULL}, 
+   {p_isnonlatin,  0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenLatWordFirst,0,     NULL}, 
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_InParseHyphen, LATHYPHENWORD,  SpecialHyphen}
+};
+
+static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isnonlatin,  0,  A_NEXT,     TPS_InHyphenCyrWord,    0,      NULL}, 
+   {p_islatin,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenValue,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
+   {p_isEOF,   0,  A_BINGO|A_CLRALL,TPS_InParseHyphen, CYRHYPHENWORD,  SpecialHyphen}, 
+   {p_isnonlatin,  0,  A_NEXT,     TPS_InHyphenCyrWord,    0,      NULL}, 
+   {p_islatin,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenCyrWordFirst,0,     NULL}, 
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_InParseHyphen,     CYRHYPHENWORD,  SpecialHyphen}
+};
+
+static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenValue,  0,      NULL}, 
+   {p_isalnum,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+static TParserStateActionItem actionTPS_InHyphenUWord[] = {
+   {p_isEOF,   0,  A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD, SpecialHyphen}, 
+   {p_isalnum,     0,  A_NEXT,     TPS_InHyphenUWord,  0,      NULL}, 
+   {p_iseqC,   '-',    A_PUSH,     TPS_InHyphenUWordFirst,0,       NULL}, 
+   {NULL,      0,  A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD, SpecialHyphen}
+};
+
+static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
+   {p_isEOF,   0,  A_POP,      TPS_Null,       0,      NULL}, 
+   {p_isdigit,     0,  A_NEXT,     TPS_InHyphenValueExact, 0,      NULL}, 
+   {NULL,      0,  A_POP,      TPS_Null,       0,      NULL}
+};
+
+/*
+ * TPS_InHyphenValue: inside a part of a hyphenated word that consists of
+ * digits so far.  Digits keep the state; '.' and '-' save the position and
+ * probe what follows (TPS_InHyphenValueFirst / TPS_InHyphenUWordFirst); a
+ * p_isalpha character turns the part into a mixed one (TPS_InHyphenUWord).
+ * EOF or any other character finishes the whole word as HYPHENWORD, drops
+ * all saved positions and continues in TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenValue[] = {
+    {p_isEOF,   0,   A_BINGO | A_CLRALL, TPS_InParseHyphen,      HYPHENWORD, SpecialHyphen},
+    {p_isdigit, 0,   A_NEXT,             TPS_InHyphenValue,      0,          NULL},
+    {p_iseqC,   '.', A_PUSH,             TPS_InHyphenValueFirst, 0,          NULL},
+    {p_iseqC,   '-', A_PUSH,             TPS_InHyphenUWordFirst, 0,          NULL},
+    {p_isalpha, 0,   A_NEXT,             TPS_InHyphenUWord,      0,          NULL},
+    {NULL,      0,   A_BINGO | A_CLRALL, TPS_InParseHyphen,      HYPHENWORD, SpecialHyphen}
+};
+
+/*
+ * TPS_InHyphenValueExact: inside the digits that follow a '.' in a numeric
+ * part of a hyphenated word.  Unlike TPS_InHyphenValue there is no rule for
+ * letters here: digits keep the state, '.' and '-' save the position and
+ * probe what follows, and EOF or any other character finishes the whole
+ * word as HYPHENWORD, drops all saved positions and continues in
+ * TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
+    {p_isEOF,   0,   A_BINGO | A_CLRALL, TPS_InParseHyphen,      HYPHENWORD, SpecialHyphen},
+    {p_isdigit, 0,   A_NEXT,             TPS_InHyphenValueExact, 0,          NULL},
+    {p_iseqC,   '.', A_PUSH,             TPS_InHyphenValueFirst, 0,          NULL},
+    {p_iseqC,   '-', A_PUSH,             TPS_InHyphenUWordFirst, 0,          NULL},
+    {NULL,      0,   A_BINGO | A_CLRALL, TPS_InParseHyphen,      HYPHENWORD, SpecialHyphen}
+};
+
+/*
+ * TPS_InParseHyphen: dispatch state entered after a whole hyphenated word
+ * has been reported; it classifies the first character of each part.
+ * (NOTE(review): presumably SpecialHyphen rewinds the position so that the
+ * parts are read again from the start of the word -- confirm against its
+ * definition.)  A '-' is probed in TPS_InParseHyphenHyphen; EOF or any
+ * other character hands control back to TPS_Base without consuming input
+ * (A_RERUN).
+ */
+static TParserStateActionItem actionTPS_InParseHyphen[] = {
+    {p_isEOF,      0,   A_RERUN, TPS_Base,                0, NULL},
+    {p_islatin,    0,   A_NEXT,  TPS_InHyphenLatWordPart, 0, NULL},
+    {p_isnonlatin, 0,   A_NEXT,  TPS_InHyphenCyrWordPart, 0, NULL},
+    {p_isdigit,    0,   A_NEXT,  TPS_InHyphenUnsignedInt, 0, NULL},
+    {p_iseqC,      '-', A_PUSH,  TPS_InParseHyphenHyphen, 0, NULL},
+    {NULL,         0,   A_RERUN, TPS_Base,                0, NULL}
+};
+
+/*
+ * TPS_InParseHyphenHyphen: probe state for the character after a '-' seen
+ * in TPS_InParseHyphen.  If an alphanumeric follows, the '-' itself is
+ * reported as a SPACE lexeme, the saved position is discarded (A_CLEAR) and
+ * parsing of the parts goes on in TPS_InParseHyphen; otherwise pop back to
+ * the position saved before the '-'.
+ */
+static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
+    {p_isEOF,   0, A_POP,             TPS_Null,          0,     NULL},
+    {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
+    {NULL,      0, A_POP,             TPS_Null,          0,     NULL}
+};
+
+/*
+ * TPS_InHyphenCyrWordPart: inside one part of a hyphenated word whose
+ * characters all satisfied p_isnonlatin so far.  A p_islatin character or a
+ * digit turns it into a mixed part (TPS_InHyphenUWordPart).  The part is
+ * reported as CYRPARTHYPHENWORD: at EOF the parser returns to TPS_Base, on
+ * any other character it stays in the part dispatcher TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
+    {p_isEOF,      0, A_BINGO, TPS_Base,                CYRPARTHYPHENWORD, NULL},
+    {p_isnonlatin, 0, A_NEXT,  TPS_InHyphenCyrWordPart, 0,                 NULL},
+    {p_islatin,    0, A_NEXT,  TPS_InHyphenUWordPart,   0,                 NULL},
+    {p_isdigit,    0, A_NEXT,  TPS_InHyphenUWordPart,   0,                 NULL},
+    {NULL,         0, A_BINGO, TPS_InParseHyphen,       CYRPARTHYPHENWORD, NULL}
+};
+
+/*
+ * TPS_InHyphenLatWordPart: inside one part of a hyphenated word whose
+ * characters all satisfied p_islatin so far.  A p_isnonlatin character or a
+ * digit turns it into a mixed part (TPS_InHyphenUWordPart).  The part is
+ * reported as LATPARTHYPHENWORD: at EOF the parser returns to TPS_Base, on
+ * any other character it stays in the part dispatcher TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
+    {p_isEOF,      0, A_BINGO, TPS_Base,                LATPARTHYPHENWORD, NULL},
+    {p_islatin,    0, A_NEXT,  TPS_InHyphenLatWordPart, 0,                 NULL},
+    {p_isnonlatin, 0, A_NEXT,  TPS_InHyphenUWordPart,   0,                 NULL},
+    {p_isdigit,    0, A_NEXT,  TPS_InHyphenUWordPart,   0,                 NULL},
+    {NULL,         0, A_BINGO, TPS_InParseHyphen,       LATPARTHYPHENWORD, NULL}
+};
+
+/*
+ * TPS_InHyphenUWordPart: inside one part of a hyphenated word with mixed
+ * alphanumeric content.  Alphanumerics keep the state; the part is reported
+ * as PARTHYPHENWORD, returning to TPS_Base at EOF and to TPS_InParseHyphen
+ * on any other character.
+ */
+static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
+    {p_isEOF,   0, A_BINGO, TPS_Base,              PARTHYPHENWORD, NULL},
+    {p_isalnum, 0, A_NEXT,  TPS_InHyphenUWordPart, 0,              NULL},
+    {NULL,      0, A_BINGO, TPS_InParseHyphen,     PARTHYPHENWORD, NULL}
+};
+
+/*
+ * TPS_InHyphenUnsignedInt: inside an all-digit part of a hyphenated word.
+ * A p_isalpha character turns it into a mixed part (TPS_InHyphenUWordPart);
+ * '.' saves the position and probes for a fractional part in
+ * TPS_InHDecimalPartFirst.  Otherwise the part is reported as UNSIGNEDINT,
+ * returning to TPS_Base at EOF and to TPS_InParseHyphen on any other
+ * character.
+ */
+static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
+    {p_isEOF,   0,   A_BINGO, TPS_Base,                UNSIGNEDINT, NULL},
+    {p_isdigit, 0,   A_NEXT,  TPS_InHyphenUnsignedInt, 0,           NULL},
+    {p_isalpha, 0,   A_NEXT,  TPS_InHyphenUWordPart,   0,           NULL},
+    {p_iseqC,   '.', A_PUSH,  TPS_InHDecimalPartFirst, 0,           NULL},
+    {NULL,      0,   A_BINGO, TPS_InParseHyphen,       UNSIGNEDINT, NULL}
+};
+
+/*
+ * TPS_InHDecimalPartFirst: probe state for the character after the '.' of a
+ * numeric part of a hyphenated word.  A digit confirms the decimal reading:
+ * the saved position is discarded (A_CLEAR) and parsing continues in
+ * TPS_InHDecimalPart.  EOF or anything else pops back to the position saved
+ * before the '.'.
+ */
+static TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
+    {p_isEOF,   0, A_POP,   TPS_Null,           0, NULL},
+    {p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
+    {NULL,      0, A_POP,   TPS_Null,           0, NULL}
+};
+
+/*
+ * TPS_InHDecimalPart: inside the fractional digits of a decimal part of a
+ * hyphenated word.  A further '.' saves the position and probes for a
+ * version number in TPS_InHVersionPartFirst.  Otherwise the part is
+ * reported as DECIMAL, returning to TPS_Base at EOF and to
+ * TPS_InParseHyphen on any other character.
+ */
+static TParserStateActionItem actionTPS_InHDecimalPart[] = {
+    {p_isEOF,   0,   A_BINGO, TPS_Base,                DECIMAL, NULL},
+    {p_isdigit, 0,   A_NEXT,  TPS_InHDecimalPart,      0,       NULL},
+    {p_iseqC,   '.', A_PUSH,  TPS_InHVersionPartFirst, 0,       NULL},
+    {NULL,      0,   A_BINGO, TPS_InParseHyphen,       DECIMAL, NULL}
+};
+
+/*
+ * TPS_InHVersionPartFirst: probe state for the character after a second (or
+ * later) '.' in a numeric part of a hyphenated word.  A digit confirms the
+ * version-number reading: the saved position is discarded (A_CLEAR) and
+ * parsing continues in TPS_InHVersionPart.  EOF or anything else pops back
+ * to the position saved before the '.'.
+ */
+static TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
+    {p_isEOF,   0, A_POP,   TPS_Null,           0, NULL},
+    {p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
+    {NULL,      0, A_POP,   TPS_Null,           0, NULL}
+};
+
+/*
+ * TPS_InHVersionPart: inside a version-number part of a hyphenated word.
+ * Digits keep the state, each further '.' saves the position and is probed
+ * in TPS_InHVersionPartFirst.  Otherwise the part is reported as
+ * VERSIONNUMBER, returning to TPS_Base at EOF and to TPS_InParseHyphen on
+ * any other character.
+ */
+static TParserStateActionItem actionTPS_InHVersionPart[] = {
+    {p_isEOF,   0,   A_BINGO, TPS_Base,                VERSIONNUMBER, NULL},
+    {p_isdigit, 0,   A_NEXT,  TPS_InHVersionPart,      0,             NULL},
+    {p_iseqC,   '.', A_PUSH,  TPS_InHVersionPartFirst, 0,             NULL},
+    {NULL,      0,   A_BINGO, TPS_InParseHyphen,       VERSIONNUMBER, NULL}
+};
+
+/*
+ * Rule table for every parser state.
+ *
+ * TParserGet() indexes this array directly with the TParserState value, so
+ * the entries MUST appear in exactly the same order as the enumerators of
+ * TParserState (an Assert in TParserGet() checks that Actions[s].state == s).
+ * The final TPS_Null entry is a terminator and has no rules.
+ */
+
+static const TParserStateAction Actions[] = {
+    { TPS_Base,                 actionTPS_Base },
+    { TPS_InUWord,              actionTPS_InUWord },
+    { TPS_InLatWord,            actionTPS_InLatWord },
+    { TPS_InCyrWord,            actionTPS_InCyrWord },
+    { TPS_InUnsignedInt,        actionTPS_InUnsignedInt },
+    { TPS_InSignedIntFirst,     actionTPS_InSignedIntFirst },
+    { TPS_InSignedInt,          actionTPS_InSignedInt },
+    { TPS_InSpace,              actionTPS_InSpace },
+    { TPS_InUDecimalFirst,      actionTPS_InUDecimalFirst },
+    { TPS_InUDecimal,           actionTPS_InUDecimal },
+    { TPS_InDecimalFirst,       actionTPS_InDecimalFirst },
+    { TPS_InDecimal,            actionTPS_InDecimal },
+    { TPS_InVersionFirst,       actionTPS_InVersionFirst },
+    { TPS_InVersion,            actionTPS_InVersion },
+    { TPS_InMantissaFirst,      actionTPS_InMantissaFirst },
+    { TPS_InMantissaSign,       actionTPS_InMantissaSign },
+    { TPS_InMantissa,           actionTPS_InMantissa },
+    { TPS_InHTMLEntityFirst,    actionTPS_InHTMLEntityFirst },
+    { TPS_InHTMLEntity,         actionTPS_InHTMLEntity },
+    { TPS_InHTMLEntityNumFirst, actionTPS_InHTMLEntityNumFirst },
+    { TPS_InHTMLEntityNum,      actionTPS_InHTMLEntityNum },
+    { TPS_InHTMLEntityEnd,      actionTPS_InHTMLEntityEnd },
+    { TPS_InTagFirst,           actionTPS_InTagFirst },
+    { TPS_InTagCloseFirst,      actionTPS_InTagCloseFirst },
+    { TPS_InTag,                actionTPS_InTag },
+    { TPS_InTagEscapeK,         actionTPS_InTagEscapeK },
+    { TPS_InTagEscapeKK,        actionTPS_InTagEscapeKK },
+    { TPS_InTagBackSleshed,     actionTPS_InTagBackSleshed },
+    { TPS_InTagEnd,             actionTPS_InTagEnd },
+    { TPS_InCommentFirst,       actionTPS_InCommentFirst },
+    { TPS_InCommentLast,        actionTPS_InCommentLast },
+    { TPS_InComment,            actionTPS_InComment },
+    { TPS_InCloseCommentFirst,  actionTPS_InCloseCommentFirst },
+    { TPS_InCloseCommentLast,   actionTPS_InCloseCommentLast },
+    { TPS_InCommentEnd,         actionTPS_InCommentEnd },
+    { TPS_InHostFirstDomen,     actionTPS_InHostFirstDomen },
+    { TPS_InHostDomenSecond,    actionTPS_InHostDomenSecond },
+    { TPS_InHostDomen,          actionTPS_InHostDomen },
+    { TPS_InPortFirst,          actionTPS_InPortFirst },
+    { TPS_InPort,               actionTPS_InPort },
+    { TPS_InHostFirstAN,        actionTPS_InHostFirstAN },
+    { TPS_InHost,               actionTPS_InHost },
+    { TPS_InEmail,              actionTPS_InEmail },
+    { TPS_InFileFirst,          actionTPS_InFileFirst },
+    { TPS_InFile,               actionTPS_InFile },
+    { TPS_InFileNext,           actionTPS_InFileNext },
+    { TPS_InURIFirst,           actionTPS_InURIFirst },
+    { TPS_InURIStart,           actionTPS_InURIStart },
+    { TPS_InURI,                actionTPS_InURI },
+    { TPS_InFURL,               actionTPS_InFURL },
+    { TPS_InProtocolFirst,      actionTPS_InProtocolFirst },
+    { TPS_InProtocolSecond,     actionTPS_InProtocolSecond },
+    { TPS_InProtocolEnd,        actionTPS_InProtocolEnd },
+    { TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst },
+    { TPS_InHyphenLatWord,      actionTPS_InHyphenLatWord },
+    { TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst },
+    { TPS_InHyphenCyrWord,      actionTPS_InHyphenCyrWord },
+    { TPS_InHyphenUWordFirst,   actionTPS_InHyphenUWordFirst },
+    { TPS_InHyphenUWord,        actionTPS_InHyphenUWord },
+    { TPS_InHyphenValueFirst,   actionTPS_InHyphenValueFirst },
+    { TPS_InHyphenValue,        actionTPS_InHyphenValue },
+    { TPS_InHyphenValueExact,   actionTPS_InHyphenValueExact },
+    { TPS_InParseHyphen,        actionTPS_InParseHyphen },
+    { TPS_InParseHyphenHyphen,  actionTPS_InParseHyphenHyphen },
+    { TPS_InHyphenCyrWordPart,  actionTPS_InHyphenCyrWordPart },
+    { TPS_InHyphenLatWordPart,  actionTPS_InHyphenLatWordPart },
+    { TPS_InHyphenUWordPart,    actionTPS_InHyphenUWordPart },
+    { TPS_InHyphenUnsignedInt,  actionTPS_InHyphenUnsignedInt },
+    { TPS_InHDecimalPartFirst,  actionTPS_InHDecimalPartFirst },
+    { TPS_InHDecimalPart,       actionTPS_InHDecimalPart },
+    { TPS_InHVersionPartFirst,  actionTPS_InHVersionPartFirst },
+    { TPS_InHVersionPart,       actionTPS_InHVersionPart },
+    { TPS_Null,                 NULL }
+};
+
+
+/*
+ * TParserGet - extract the next lexeme from the parser's input.
+ *
+ * Drives the state machine described by Actions[]: for the current state the
+ * rules are tried in order and the first one whose character test succeeds
+ * (or the final rule with a NULL test) is applied.  Rules may push, pop,
+ * drop or merge saved positions, which lets the parser try a longer reading
+ * of the input and fall back when it does not work out.
+ *
+ * On success prs->lexeme, prs->lenbytelexeme, prs->lencharlexeme and
+ * prs->type describe the lexeme; prs->lexeme points into prs->str and is
+ * not NUL-terminated.
+ *
+ * Returns true if a lexeme was produced, false if the input is exhausted or
+ * the loop ended without a rule flagged A_BINGO.
+ */
+bool
+TParserGet( TParser *prs ) {
+   TParserStateActionItem *item=NULL;
+
+   /* check the state pointer before it is dereferenced for the first time */
+   Assert( prs->state );
+
+   if ( prs->state->posbyte >= prs->lenstr ) 
+       return false;
+
+   prs->lexeme    = prs->str + prs->state->posbyte;
+   prs->state->pushedAtAction = NULL;
+
+   /* look at string */
+   while (prs->state->posbyte <= prs->lenstr) {
+       /* charlen == 0 marks the virtual end-of-input position */
+       if ( prs->state->posbyte == prs->lenstr ) 
+           prs->state->charlen = 0;
+       else
+           prs->state->charlen = ( prs->charmaxlen == 1 ) ? prs->charmaxlen : 
+               pg_mblen( prs->str + prs->state->posbyte );
+
+       Assert( prs->state->posbyte + prs->state->charlen <= prs->lenstr ); 
+       Assert( prs->state->state >=TPS_Base && prs->state->state < TPS_Null );
+       Assert( Actions[ prs->state->state ].state == prs->state->state ); 
+
+       item = Actions[ prs->state->state ].action;
+       Assert(item!=NULL);
+
+       /*
+        * If we have just popped back to this position, resume the search at
+        * the rule that did the push (pushedAtAction is NULL otherwise).
+        */
+       if ( item < prs->state->pushedAtAction )
+           item =  prs->state->pushedAtAction;
+
+       /* find action by character class */
+       while( item->isclass ) {
+           prs->c = item->c;   /* argument for tests that compare against a char */
+           if ( item->isclass(prs)!=0 ) {
+               /*
+                * The rule we pushed at already turned out to be a wrong
+                * way, so only rules strictly after it may be chosen.
+                */
+               if ( item > prs->state->pushedAtAction )
+                   break;
+           } 
+           item++;
+       }
+
+       prs->state->pushedAtAction = NULL;
+
+       /* call special handler if exists */
+       if ( item->special )
+           item->special(prs);
+
+       /* BINGO, lexeme is found */
+       if ( item->flags & A_BINGO ) {
+           Assert( item->type>0 );
+           prs->lenbytelexeme = prs->state->lenbytelexeme;
+           prs->lencharlexeme = prs->state->lencharlexeme;
+           prs->state->lenbytelexeme = prs->state->lencharlexeme = 0;
+           prs->type = item->type;
+       } 
+
+       /* do various actions by flags; at most one of them is applied */
+       if ( item->flags & A_POP ) {  /* pop stored state in stack */
+           TParserPosition *ptr = prs->state->prev;
+           pfree( prs->state );
+           prs->state = ptr;
+           Assert( prs->state );
+       } else if ( item->flags & A_PUSH ) { /* push (store) state in stack */ 
+           prs->state->pushedAtAction = item; /* remember where we push */
+           prs->state = newTParserPosition( prs->state );
+       } else if ( item->flags & A_CLEAR ) { /* clear previous pushed state */
+           TParserPosition *ptr;
+           Assert( prs->state->prev );
+           ptr = prs->state->prev->prev;
+           pfree( prs->state->prev );
+           prs->state->prev = ptr;
+       } else if ( item->flags & A_CLRALL ) { /* clear all previous pushed state */
+           TParserPosition *ptr;
+           while( prs->state->prev ) {
+               ptr = prs->state->prev->prev;
+               pfree( prs->state->prev );
+               prs->state->prev = ptr;
+           }
+       } else if ( item->flags & A_MERGE ) { /* merge posinfo with current and pushed state */
+           TParserPosition *ptr = prs->state;
+           Assert( prs->state->prev );
+           prs->state = prs->state->prev;
+
+           /* keep the pushed entry but adopt the position reached so far */
+           prs->state->posbyte = ptr->posbyte;
+           prs->state->poschar = ptr->poschar;
+           prs->state->charlen = ptr->charlen;
+           prs->state->lenbytelexeme = ptr->lenbytelexeme;
+           prs->state->lencharlexeme = ptr->lencharlexeme;
+           pfree(ptr); 
+       }
+
+       /* set new state if pointed */
+       if ( item->tostate != TPS_Null ) 
+           prs->state->state = item->tostate;
+
+       /* stop on a found lexeme, or at end of input unless a rerun is requested */
+       if ( (item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN)==0 ) ) 
+           break;
+
+       /* go to beginning of loop if we should rerun or we just restored state */
+       if ( item->flags & ( A_RERUN | A_POP ) )
+           continue;
+   
+       /* move forward by one character (nothing to do at end of input) */
+       if ( prs->state->charlen ) {
+           prs->state->posbyte += prs->state->charlen;
+           prs->state->lenbytelexeme += prs->state->charlen;
+           prs->state->poschar ++;
+           prs->state->lencharlexeme ++;
+       }
+   } 
+
+   return (item && (item->flags & A_BINGO)) ? true : false;
+}
+
+
index 3f0e0cd6359ff66f4f91050a215703d4319c6ab9..ee5b3b7ab5471e6dc194647db98bcd3542977464 100644 (file)
 #ifndef __PARSER_H__
 #define __PARSER_H__
 
-extern char *token;
-extern int tokenlen;
-int            tsearch2_yylex(void);
-void       tsearch2_start_parse_str(char *, int);
-void       tsearch2_end_parse(void);
+#include <ctype.h>
+#include <limits.h>
+#include "ts_locale.h"
+
+/*
+ * States of the parser's state machine.
+ *
+ * The numeric values index the Actions[] table in parser.c, so enumerators
+ * must stay contiguous from 0 and in the same order as that table.  The
+ * group headings below simply follow the state names.
+ */
+typedef enum {
+    /* start state */
+    TPS_Base = 0,
+
+    /* words */
+    TPS_InUWord,
+    TPS_InLatWord,
+    TPS_InCyrWord,
+
+    /* numbers and whitespace */
+    TPS_InUnsignedInt,
+    TPS_InSignedIntFirst,
+    TPS_InSignedInt,
+    TPS_InSpace,
+    TPS_InUDecimalFirst,
+    TPS_InUDecimal,
+    TPS_InDecimalFirst,
+    TPS_InDecimal,
+    TPS_InVersionFirst,
+    TPS_InVersion,
+    TPS_InMantissaFirst,
+    TPS_InMantissaSign,
+    TPS_InMantissa,
+
+    /* HTML entities */
+    TPS_InHTMLEntityFirst,
+    TPS_InHTMLEntity,
+    TPS_InHTMLEntityNumFirst,
+    TPS_InHTMLEntityNum,
+    TPS_InHTMLEntityEnd,
+
+    /* tags */
+    TPS_InTagFirst,
+    TPS_InTagCloseFirst,
+    TPS_InTag,
+    TPS_InTagEscapeK,
+    TPS_InTagEscapeKK,
+    TPS_InTagBackSleshed,
+    TPS_InTagEnd,
+
+    /* comments */
+    TPS_InCommentFirst,
+    TPS_InCommentLast,
+    TPS_InComment,
+    TPS_InCloseCommentFirst,
+    TPS_InCloseCommentLast,
+    TPS_InCommentEnd,
+
+    /* hosts, ports and e-mail addresses */
+    TPS_InHostFirstDomen,
+    TPS_InHostDomenSecond,
+    TPS_InHostDomen,
+    TPS_InPortFirst,
+    TPS_InPort,
+    TPS_InHostFirstAN,
+    TPS_InHost,
+    TPS_InEmail,
+
+    /* file paths, URIs and protocol prefixes */
+    TPS_InFileFirst,
+    TPS_InFile,
+    TPS_InFileNext,
+    TPS_InURIFirst,
+    TPS_InURIStart,
+    TPS_InURI,
+    TPS_InFURL,
+    TPS_InProtocolFirst,
+    TPS_InProtocolSecond,
+    TPS_InProtocolEnd,
+
+    /* whole hyphenated words */
+    TPS_InHyphenLatWordFirst,
+    TPS_InHyphenLatWord,
+    TPS_InHyphenCyrWordFirst,
+    TPS_InHyphenCyrWord,
+    TPS_InHyphenUWordFirst,
+    TPS_InHyphenUWord,
+    TPS_InHyphenValueFirst,
+    TPS_InHyphenValue,
+    TPS_InHyphenValueExact,
+
+    /* parts of an already reported hyphenated word */
+    TPS_InParseHyphen,
+    TPS_InParseHyphenHyphen,
+    TPS_InHyphenCyrWordPart,
+    TPS_InHyphenLatWordPart,
+    TPS_InHyphenUWordPart,
+    TPS_InHyphenUnsignedInt,
+    TPS_InHDecimalPartFirst,
+    TPS_InHDecimalPart,
+    TPS_InHVersionPartFirst,
+    TPS_InHVersionPart,
+
+    TPS_Null  /* last state (fake value); also means "no state change" in a rule */
+} TParserState;
+
+/* TParser is defined further down; the callback types only need its name */
+struct TParser;
+
+
+/*
+ * Character test used as a rule condition: returns non-zero if the rule
+ * applies at the parser's current position (any p_is* function except
+ * p_iseq).
+ */
+typedef int (*TParserCharTest)(struct TParser*);
+
+/* Optional handler run by TParserGet() when a rule is applied. */
+typedef void (*TParserSpecial)(struct TParser*);
+
+/*
+ * One rule of a state's rule list.  Rule lists are written as positional
+ * initializers, so the field order here must not change.
+ */
+typedef struct {
+        TParserCharTest isclass;    /* condition; NULL marks the final, default rule */
+        char            c;          /* copied to TParser.c before isclass is called */
+        uint16          flags;      /* A_* flags telling TParserGet() what to do */
+        TParserState    tostate;    /* state to switch to; TPS_Null = no change */
+        int             type;       /* lexeme type reported with A_BINGO (must be > 0 then) */
+        TParserSpecial  special;    /* optional handler, may be NULL */
+} TParserStateActionItem;
+
+/*
+ * Binds a state to its rule list.  'state' repeats the array index so that
+ * TParserGet() can assert the table order matches TParserState.
+ */
+typedef struct {
+        TParserState            state;      /* state this entry belongs to */
+        TParserStateActionItem  *action;    /* rule list, ended by a rule with NULL isclass */
+} TParserStateAction;
+
+/*
+ * Current or saved position of the parser.  Positions form a stack linked
+ * through 'prev'; TParserGet() pushes and pops entries while it tries
+ * alternative readings of the input.
+ */
+typedef struct TParserPosition {
+    int     posbyte;        /* position of parser in bytes */
+    int     poschar;        /* position of parser in characters */
+    int     charlen;        /* length in bytes of current char, 0 at end of input */
+    int     lenbytelexeme;  /* bytes consumed for the lexeme being built */
+    int     lencharlexeme;  /* characters consumed for the lexeme being built */
+    TParserState    state;  /* state of the machine at this position */
+    struct TParserPosition  *prev;  /* previously pushed position, or NULL */
+    int     flags;          /* NOTE(review): not used by TParserGet(); see parser.c */
+    TParserStateActionItem  *pushedAtAction;    /* rule that pushed from here, else NULL */
+} TParserPosition;
+
+/*
+ * Parser object: the input, the state of the machine and the description of
+ * the lexeme most recently returned by TParserGet().
+ */
+typedef struct TParser {
+    /* input string and its length */
+    char        *str;       /* multibyte string */
+    int         lenstr;     /* length of mbstring */
+    wchar_t     *wstr;      /* wide character string */
+    int         lenwstr;    /* length of wstring */
+
+    /* state of parse */
+    int         charmaxlen; /* if 1, every character is taken as one byte;
+                             * otherwise pg_mblen() is asked for each one */
+    bool        usewide;    /* NOTE(review): presumably "wstr is in use" -- confirm */
+    TParserPosition *state; /* top of the position stack (current position) */
+    bool        ignore;     /* NOTE(review): set/read outside TParserGet() */
+    bool        wanthost;   /* NOTE(review): set/read outside TParserGet() */
+
+    /* character argument handed to the rule's test (copy of the rule's 'c') */
+    char        c;
+
+    /* output of TParserGet() */
+    char        *lexeme;        /* start of lexeme inside str, not NUL-terminated */
+    int         lenbytelexeme;  /* lexeme length in bytes */
+    int         lencharlexeme;  /* lexeme length in characters */
+    int         type;           /* lexeme type taken from the matching rule */
+
+} TParser;
+
+
+/* create a parser for a string given as pointer and length */
+TParser    *TParserInit( char *str, int len );
+/* fetch the next lexeme into *prs; false when none was produced */
+bool        TParserGet( TParser *prs );
+/* dispose of a parser obtained from TParserInit() */
+void        TParserClose( TParser *prs );
 
 #endif
diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l
deleted file mode 100644 (file)
index a7cb468..0000000
+++ /dev/null
@@ -1,346 +0,0 @@
-%{
-#include "postgres.h"
-
-#include "deflex.h"
-#include "parser.h"
-#include "common.h"
-
-/* Avoid exit() on fatal scanner errors */
-#undef fprintf
-#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)
-
-char *token = NULL;  /* pointer to token */
-int tokenlen;
-static char *s     = NULL;  /* to return WHOLE hyphenated-word */
-
-YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
-
-typedef struct {
-   int tlen;
-   int clen;
-   char *str;
-} TagStorage;
-
-static TagStorage ts={0,0,NULL};
-
-static void
-addTag(void)
-{
-   while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
-       ts.tlen*=2;
-       ts.str=realloc(ts.str,ts.tlen);
-       if (!ts.str)
-                   ereport(ERROR,
-                                   (errcode(ERRCODE_OUT_OF_MEMORY),
-                                    errmsg("out of memory")));
-        }
-        memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
-        ts.clen+=tsearch2_yyleng;
-   ts.str[ts.clen]='\0';
-}
-
-static void
-startTag(void)
-{
-   if ( ts.str==NULL ) {
-       ts.tlen=tsearch2_yyleng+1;
-       ts.str=malloc(ts.tlen);
-       if (!ts.str)
-                   ereport(ERROR,
-                                (errcode(ERRCODE_OUT_OF_MEMORY),
-                                 errmsg("out of memory")));
-   }
-   ts.clen=0;
-   ts.str[0]='\0';
-   addTag();
-}
-
-%}
-
-%option 8bit
-%option never-interactive
-%option nodefault
-%option nounput
-%option noyywrap
-
-/* parser's state for parsing hyphenated-word */
-%x DELIM  
-/* parser's state for parsing URL*/
-%x URL  
-%x SERVER  
-
-/* parser's state for parsing TAGS */
-%x INTAG
-%x QINTAG
-%x INCOMMENT
-%x INSCRIPT
-
-/* cyrillic koi8 char */
-CYRALNUM   [0-9\200-\377]
-CYRALPHA   [\200-\377]
-ALPHA      [a-zA-Z\200-\377]
-ALNUM      [0-9a-zA-Z\200-\377]
-
-
-HOSTNAME   ([-_[:alnum:]]+\.)+[[:alpha:]]+
-URI        [-_[:alnum:]/%,\.;=&?#]+
-
-%%
-
-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
-
-<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
-   BEGIN INITIAL; 
-   addTag();
-   token = ts.str;
-   tokenlen = ts.clen;
-   return TAG;
-}
-
-"<!--" { BEGIN INCOMMENT; startTag(); }
-
-<INCOMMENT>"-->"   { 
-   BEGIN INITIAL;
-   addTag();
-   token = ts.str;
-   tokenlen = ts.clen;
-   return TAG;
-}
-
-
-"<"[\![:alpha:]]   { BEGIN INTAG; startTag(); }
-
-"</"[[:alpha:]]    { BEGIN INTAG; startTag(); }
-
-<INTAG>"\""    { BEGIN QINTAG; addTag(); }
-
-<QINTAG>"\\\"" { addTag(); }
-
-<QINTAG>"\""   { BEGIN INTAG; addTag(); }
-
-<INTAG>">" { 
-   BEGIN INITIAL;
-   addTag();
-   token = ts.str;
-   tokenlen = ts.clen;
-   return TAG;
-}
-
-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }    
-
-\&(quot|amp|nbsp|lt|gt)\;   {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return HTMLENTITY;
-}
-
-\&\#[0-9][0-9]?[0-9]?\; {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return HTMLENTITY;
-}
-[-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return EMAIL; 
-}
-
-[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */   { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return SCIENTIFIC; 
-}
-
-[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return VERSIONNUMBER;
-}
-
-[+-]?[0-9]+\.[0-9]+ {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return DECIMAL;
-}
-
-[+-][0-9]+ { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return SIGNEDINT; 
-}
-
-<DELIM,INITIAL>[0-9]+ { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return UNSIGNEDINT; 
-}
-
-http"://"        { 
-   BEGIN URL; 
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return HTTP;
-}
-
-ftp"://"        { 
-   BEGIN URL; 
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return HTTP;
-}
-
-<URL,INITIAL>{HOSTNAME}[/:]{URI} { 
-   BEGIN SERVER;
-   if (s) { free(s); s=NULL; } 
-   s = strdup( tsearch2_yytext ); 
-   tokenlen = tsearch2_yyleng;
-   yyless( 0 ); 
-   token = s;
-   return FURL;
-}
-
-<SERVER,URL,INITIAL>{HOSTNAME} {
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return HOST;
-}
-
-<SERVER>[/:]{URI}  {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return URI;
-}
-
-[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return FILEPATH;
-}
-
-({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */    {
-   BEGIN DELIM;
-   if (s) { free(s); s=NULL; } 
-   s = strdup( tsearch2_yytext );
-   tokenlen = tsearch2_yyleng;
-   yyless( 0 );
-   token = s;
-   return CYRHYPHENWORD;
-}
-
-([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */  {
-    BEGIN DELIM;
-   if (s) { free(s); s=NULL; } 
-   s = strdup( tsearch2_yytext );
-   tokenlen = tsearch2_yyleng;
-   yyless( 0 );
-   token = s;
-   return LATHYPHENWORD;
-}
-
-({ALNUM}+-)+{ALNUM}+ /* composite-word */  {
-   BEGIN DELIM;
-   if (s) { free(s); s=NULL; } 
-   s = strdup( tsearch2_yytext );
-   tokenlen = tsearch2_yyleng;
-   yyless( 0 );
-   token = s;
-   return HYPHENWORD;
-}
-
-<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return VERSIONNUMBER;
-}
-
-<DELIM>\+?[0-9]+\.[0-9]+ {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return DECIMAL;
-}
-
-<DELIM>{CYRALPHA}+  /* one word in composite-word */   { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return CYRPARTHYPHENWORD; 
-}
-
-<DELIM>[[:alpha:]]+  /* one word in composite-word */  { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return LATPARTHYPHENWORD; 
-}
-
-<DELIM>{ALNUM}+  /* one word in composite-word */  { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return PARTHYPHENWORD; 
-}
-
-<DELIM>-  { 
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return SPACE;
-}
-
-<DELIM,SERVER,URL>.|\n /* return in basic state */ {
-   BEGIN INITIAL;
-   yyless( 0 );
-}
-
-{CYRALPHA}+ /* normal word */  { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return CYRWORD; 
-}
-
-[[:alpha:]]+ /* normal word */ { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return LATWORD; 
-}
-
-{ALNUM}+ /* normal word */ { 
-   token = tsearch2_yytext; 
-   tokenlen = tsearch2_yyleng;
-   return UWORD; 
-}
-
-[ \r\n\t]+ {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return SPACE;
-}
-
-. {
-   token = tsearch2_yytext;
-   tokenlen = tsearch2_yyleng;
-   return SPACE;
-} 
-
-%%
-
-/* clearing after parsing from string */
-void
-tsearch2_end_parse(void)
-{
-   if (s)
-   {
-       free(s);
-       s = NULL;
-   } 
-   tsearch2_yy_delete_buffer( buf );
-   buf = NULL;
-} 
-
-/* start parse from string */
-void
-tsearch2_start_parse_str(char* str, int limit)
-{
-   if (buf)
-       tsearch2_end_parse();
-   buf = tsearch2_yy_scan_bytes( str, limit );
-   tsearch2_yy_switch_to_buffer( buf );
-   BEGIN INITIAL;
-}
index 6686257887222aa2face214f4d3dfd24cb7a8190..897ff2795e27690f7f0885eb16d82840718fe826 100644 (file)
@@ -39,8 +39,7 @@ Datum     prsd_start(PG_FUNCTION_ARGS);
 Datum
 prsd_start(PG_FUNCTION_ARGS)
 {
-   tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
-   PG_RETURN_POINTER(NULL);
+   PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
 }
 
 PG_FUNCTION_INFO_V1(prsd_getlexeme);
@@ -48,14 +47,17 @@ Datum       prsd_getlexeme(PG_FUNCTION_ARGS);
 Datum
 prsd_getlexeme(PG_FUNCTION_ARGS)
 {
-   /* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
+   TParser *p=(TParser*)PG_GETARG_POINTER(0); 
    char      **t = (char **) PG_GETARG_POINTER(1);
    int        *tlen = (int *) PG_GETARG_POINTER(2);
-   int         type = tsearch2_yylex();
 
-   *t = token;
-   *tlen = tokenlen;
-   PG_RETURN_INT32(type);
+   if ( !TParserGet(p) ) 
+       PG_RETURN_INT32(0);
+
+   *t = p->lexeme; 
+   *tlen = p->lenbytelexeme;
+
+   PG_RETURN_INT32(p->type);
 }
 
 PG_FUNCTION_INFO_V1(prsd_end);
@@ -63,8 +65,8 @@ Datum     prsd_end(PG_FUNCTION_ARGS);
 Datum
 prsd_end(PG_FUNCTION_ARGS)
 {
-   /* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
-   tsearch2_end_parse();
+   TParser *p=(TParser*)PG_GETARG_POINTER(0);
+   TParserClose(p); 
    PG_RETURN_VOID();
 }