{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
- {NODICT, NODICT}, /* FLOAT */
- {NODICT, NODICT}, /* FINT */
- {BYLOCALE, DEFAULTDICT}, /* PARTWORD */
- {BYLOCALE, NODICT}, /* NONLATINPARTWORD */
- {DEFAULTDICT, NODICT}, /* LATPARTWORD */
+ {NODICT, NODICT}, /* SCIENTIFIC */
+ {NODICT, NODICT}, /* VERSIONNUMBER */
+ {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
+ {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
+ {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
{STOPLEXEM, NODICT}, /* SPACE */
- {STOPLEXEM, NODICT}, /* SYMTAG */
+ {STOPLEXEM, NODICT}, /* TAG */
{STOPLEXEM, NODICT}, /* HTTP */
- {BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
- {DEFAULTDICT, NODICT}, /* DEFISLATWORD */
- {BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
+ {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
+ {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
+ {BYLOCALE, NODICT}, /* CYRHYPHENWORD */
{NODICT, NODICT}, /* URI */
- {NODICT, NODICT} /* FILEPATH */
+ {NODICT, NODICT}, /* FILEPATH */
+ {NODICT, NODICT}, /* DECIMAL */
+ {NODICT, NODICT}, /* SIGNEDINT */
+ {NODICT, NODICT}, /* UNSIGNEDINT */
+ {STOPLEXEM, NODICT} /* HTMLENTITY */
};
static bool inited = false;
/* postgres allocation function */
#include "postgres.h"
-#define free pfree
-#define malloc palloc
+#define free pfree
+#define malloc palloc
#define realloc repalloc
#ifdef strdup
#undef strdup
#endif
-#define strdup pstrdup
-
+#define strdup pstrdup
char *token = NULL; /* pointer to token */
-char *s = NULL; /* for returning full defis-word */
+char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
%option nounput
%option noyywrap
-
-/* parser's state for parsing defis-word */
+/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
-/* parser's state for parsing filepath */
-
+/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
+%x INCOMMENT
+%x INSCRIPT
-/* NONLATIN char */
-NONLATINALNUM [0-9\200-\377]
-NONLATINALPHA [\200-\377]
+/* cyrillic koi8 char: any byte \200-\377 (CYRALNUM also accepts 0-9) */
+CYRALNUM [0-9\200-\377]
+CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
%%
-"<"[[:alpha:]] { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
-
-"</"[[:alpha:]] { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; /* case-insensitive "<script": enter INSCRIPT, no token is returned */ }
-"<>" {
+<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+ /* closing script tag: leave INSCRIPT and report a single blank */
+ BEGIN INITIAL;
+ *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
+ tokenlen = 1; /* the buffer now holds " " only; yyleng still counts the whole closing tag */
+ return SPACE;
}
-"<"[^>[:alpha:]] {
+"<!--" { BEGIN INCOMMENT; /* comment opener: enter INCOMMENT, no token is returned */ }
+
+<INCOMMENT>"-->" {
+ /* end of comment: leave INCOMMENT and report a single blank */
+ BEGIN INITIAL;
+ *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
+ tokenlen = 1; /* the buffer now holds " " only; yyleng still counts all of "-->" */
return SPACE;
}
-<INTAG>"\"" { BEGIN QINTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
-<QINTAG>"\\\"" {
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
-}
+"<"[\![:alpha:]] { BEGIN INTAG; /* "<" followed by "!" or a letter: enter INTAG, no token is returned */ }
-<QINTAG>"\"" { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+"</"[[:alpha:]] { BEGIN INTAG; /* "</" followed by a letter is scanned in the same INTAG state */ }
-<QINTAG>.|\n {
+<INTAG>"\"" { BEGIN QINTAG; /* double quote inside a tag: switch to QINTAG */ }
+
+<QINTAG>"\\\"" ; /* backslash followed by a quote: consumed, state stays QINTAG */
+
+<QINTAG>"\"" { BEGIN INTAG; /* unescaped quote: back to INTAG */ }
+
+<INTAG>">" {
+ /* end of tag: return to INITIAL and report one blank of type TAG */
+ BEGIN INITIAL;
token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
+ *tsearch_yytext=' '; /* token already points here, so it now reads " " */
+ tokenlen = 1;
+ return TAG;
}
-<INTAG>">" { BEGIN INITIAL;
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ; /* any other character, newline included, is discarded in these four states */
+
+\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+ return HTMLENTITY; /* named entity: &quot; &amp; &nbsp; &lt; or &gt; */
+}
-<INTAG>.|\n {
+\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return SYMTAG;
+ return HTMLENTITY; /* numeric entity &#N; with one to three decimal digits */
}
-
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch_yytext;
return EMAIL;
}
-<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ {
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FINT;
+ return SCIENTIFIC; /* optionally signed mantissa, exponent marker e, E, d or D, optionally signed exponent */
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return VERSIONNUMBER; /* digits containing at least two dots and ending in a digit, e.g. 1.2.3 */
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return DECIMAL; /* optionally signed digits, one decimal point, digits */
}
-<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ {
+[+-][0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FINT;
+ return SIGNEDINT; /* digits with a mandatory leading + or - */
}
-[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ {
+<DELIM,INITIAL>[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FLOAT;
+ return UNSIGNEDINT; /* bare run of digits; active in both DELIM and INITIAL */
}
http"://" {
return FILEPATH;
}
-({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ {
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
- return DEFISNONLATINWORD;
+ return CYRHYPHENWORD; /* whole word is returned from the copy in s; yyless(0) pushed the text back so the DELIM rules rescan its parts */
}
-([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ {
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
- tokenlen = tsearch_yyleng;
s = strdup( tsearch_yytext );
+ tokenlen = tsearch_yyleng; /* length is taken before yyless(0) below resets yyleng */
yyless( 0 );
token = s;
- return DEFISLATWORD;
+ return LATHYPHENWORD; /* whole word is returned from the copy in s; its parts are rescanned in DELIM */
}
-({ALNUM}+-)+{ALPHA}+ /* composite-word */ {
+({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
- return DEFISWORD;
+ return HYPHENWORD; /* general ALNUM form; whole word is returned from the copy in s, parts are rescanned in DELIM */
+}
+
+<DELIM>\+?[0-9]+\.[0-9]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return DECIMAL; /* in DELIM only "+" is accepted as a sign in front of the number */
}
-<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ {
+<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return NONLATINPARTWORD;
+ return CYRPARTHYPHENWORD; /* part made only of bytes \200-\377 */
}
-<DELIM>[[:alnum:]]+ /* one word in composite-word */ {
+<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return LATPARTWORD;
+ return LATPARTHYPHENWORD; /* part made only of [[:alpha:]] characters */
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return PARTWORD;
+ return PARTHYPHENWORD; /* part matching the general ALNUM class */
}
<DELIM>- {
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
- tokenlen = tsearch_yyleng;
yyless( 0 );
}
-{NONLATINALNUM}+ /* normal word */ {
+{CYRALPHA}+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return NONLATINWORD;
+ return CYRWORD;
}
-[[:alnum:]]+ /* normal word */ {
+[[:alpha:]]+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATWORD;
return UWORD;
}
-.|\n {
+[ \r\n\t]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return SPACE; /* a whole run of blanks, tabs, CRs and newlines is one SPACE token */
+}
+
+. {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;