Teach regex_fixed_prefix() the correct handling of advanced regex
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 2 Dec 2004 02:45:07 +0000 (02:45 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 2 Dec 2004 02:45:07 +0000 (02:45 +0000)
escapes --- they aren't simply quoted characters.  Problem noted by
Antti Salmela.  Also fix problem with incorrect handling of multibyte
characters when followed by a quantifier.

src/backend/utils/adt/selfuncs.c

index a3f782895b9b7f765045409042adc20cbc43dc64..c388d28717e42a94ed23047949a5a702ed390ddd 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.167 2004/11/09 00:34:42 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.168 2004/12/02 02:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -3459,6 +3459,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
        char       *match;
        int                     pos,
                                match_pos,
+                               prev_pos,
+                               prev_match_pos,
                                paren_depth;
        char       *patt;
        char       *rest;
@@ -3519,11 +3521,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 
        /* OK, allocate space for pattern */
        match = palloc(strlen(patt) + 1);
-       match_pos = 0;
+       prev_match_pos = match_pos = 0;
 
        /* note start at pos 1 to skip leading ^ */
-       for (pos = 1; patt[pos]; pos++)
+       for (prev_pos = pos = 1; patt[pos]; )
        {
+               int             len;
+
                /*
                 * Check for characters that indicate multiple possible matches
                 * here. XXX I suspect isalpha() is not an adequately
@@ -3537,6 +3541,14 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                        (case_insensitive && isalpha((unsigned char) patt[pos])))
                        break;
 
+               /*
+                * In AREs, backslash followed by alphanumeric is an escape, not
+                * a quoted character.  Must treat it as having multiple possible
+                * matches.
+                */
+               if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
+                       break;
+
                /*
                 * Check for quantifiers.  Except for +, this means the preceding
                 * character is optional, so we must remove it from the prefix
@@ -3546,14 +3558,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                        patt[pos] == '?' ||
                        patt[pos] == '{')
                {
-                       if (match_pos > 0)
-                               match_pos--;
-                       pos--;
+                       match_pos = prev_match_pos;
+                       pos = prev_pos;
                        break;
                }
                if (patt[pos] == '+')
                {
-                       pos--;
+                       pos = prev_pos;
                        break;
                }
                if (patt[pos] == '\\')
@@ -3563,7 +3574,14 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                        if (patt[pos] == '\0')
                                break;
                }
-               match[match_pos++] = patt[pos];
+               /* save position in case we need to back up on next loop cycle */
+               prev_match_pos = match_pos;
+               prev_pos = pos;
+               /* must use encoding-aware processing here */
+               len = pg_mblen(&patt[pos]);
+               memcpy(&match[match_pos], &patt[pos], len);
+               match_pos += len;
+               pos += len;
        }
 
        match[match_pos] = '\0';