Let regexp_replace() make use of REG_NOSUB when feasible.

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 268cee1cbeddeffc74add77992015265a3077ec3..3b7a607437c96179ae618add9b5e7a2bbd23f24c 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -630,11 +630,10 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
     text       *s = PG_GETARG_TEXT_PP(0);
     text       *p = PG_GETARG_TEXT_PP(1);
     text       *r = PG_GETARG_TEXT_PP(2);
-   regex_t    *re;
-
-   re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
  
-   PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, 1));
+   PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                        REG_ADVANCED, PG_GET_COLLATION(),
+                                        0, 1));
  }
  
  /*
@@ -648,7 +647,6 @@ textregexreplace(PG_FUNCTION_ARGS)
     text       *p = PG_GETARG_TEXT_PP(1);
     text       *r = PG_GETARG_TEXT_PP(2);
     text       *opt = PG_GETARG_TEXT_PP(3);
-   regex_t    *re;
     pg_re_flags flags;
  
     /*
@@ -672,10 +670,9 @@ textregexreplace(PG_FUNCTION_ARGS)
  
     parse_re_flags(&flags, opt);
  
-   re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
-
-   PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0,
-                                        flags.glob ? 0 : 1));
+   PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                        flags.cflags, PG_GET_COLLATION(),
+                                        0, flags.glob ? 0 : 1));
  }
  
  /*
@@ -694,7 +691,6 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
     int         n = 1;
     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
     pg_re_flags re_flags;
-   regex_t    *re;
  
     /* Collect optional parameters */
     if (PG_NARGS() > 3)
@@ -723,11 +719,10 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
     if (PG_NARGS() <= 4)
         n = re_flags.glob ? 0 : 1;
  
-   /* Compile the regular expression */
-   re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION());
-
     /* Do the replacement(s) */
-   PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, start - 1, n));
+   PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                        re_flags.cflags, PG_GET_COLLATION(),
+                                        start - 1, n));
  }
  
  /* This is separate to keep the opr_sanity regression test from complaining */
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 348b5566de45733e67966e502c8876e49c87d0cc..acb87417341c107984b8ffc5cae0c98345c284a0 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -4359,34 +4359,36 @@ replace_text(PG_FUNCTION_ARGS)
  }
  
  /*
- * check_replace_text_has_escape_char
+ * check_replace_text_has_escape
   *
- * check whether replace_text contains escape char.
+ * Returns 0 if text contains no backslashes that need processing.
+ * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
+ * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
   */
-static bool
-check_replace_text_has_escape_char(const text *replace_text)
+static int
+check_replace_text_has_escape(const text *replace_text)
  {
+   int         result = 0;
     const char *p = VARDATA_ANY(replace_text);
     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
  
-   if (pg_database_encoding_max_length() == 1)
-   {
-       for (; p < p_end; p++)
-       {
-           if (*p == '\\')
-               return true;
-       }
-   }
-   else
+   while (p < p_end)
     {
-       for (; p < p_end; p += pg_mblen(p))
+       /* Find next escape char, if any. */
+       p = memchr(p, '\\', p_end - p);
+       if (p == NULL)
+           break;
+       p++;
+       /* Note: a backslash at the end doesn't require extra processing. */
+       if (p < p_end)
         {
-           if (*p == '\\')
-               return true;
+           if (*p >= '1' && *p <= '9')
+               return 2;       /* Found a submatch specifier, so done */
+           result = 1;         /* Found some other sequence, keep looking */
+           p++;
         }
     }
-
-   return false;
+   return result;
  }
  
  /*
@@ -4403,25 +4405,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
  {
     const char *p = VARDATA_ANY(replace_text);
     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
-   int         eml = pg_database_encoding_max_length();
  
-   for (;;)
+   while (p < p_end)
     {
         const char *chunk_start = p;
         int         so;
         int         eo;
  
-       /* Find next escape char. */
-       if (eml == 1)
-       {
-           for (; p < p_end && *p != '\\'; p++)
-                /* nothing */ ;
-       }
-       else
-       {
-           for (; p < p_end && *p != '\\'; p += pg_mblen(p))
-                /* nothing */ ;
-       }
+       /* Find next escape char, if any. */
+       p = memchr(p, '\\', p_end - p);
+       if (p == NULL)
+           p = p_end;
  
         /* Copy the text we just scanned over, if any. */
         if (p > chunk_start)
@@ -4473,7 +4467,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
             continue;
         }
  
-       if (so != -1 && eo != -1)
+       if (so >= 0 && eo >= 0)
         {
             /*
              * Copy the text that is back reference of regexp.  Note so and eo
@@ -4491,36 +4485,37 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
     }
  }
  
-#define REGEXP_REPLACE_BACKREF_CNT     10
-
  /*
   * replace_text_regexp
   *
- * replace substring(s) in src_text that match regexp with replace_text.
+ * replace substring(s) in src_text that match pattern with replace_text.
+ * The replace_text can contain backslash markers to substitute
+ * (parts of) the matched text.
   *
+ * cflags: regexp compile flags.
+ * collation: collation to use.
   * search_start: the character (not byte) offset in src_text at which to
   * begin searching.
   * n: if 0, replace all matches; if > 0, replace only the N'th match.
- *
- * Note: to avoid having to include regex.h in builtins.h, we declare
- * the regexp argument as void *, but really it's regex_t *.
   */
  text *
-replace_text_regexp(text *src_text, void *regexp,
+replace_text_regexp(text *src_text, text *pattern_text,
                     text *replace_text,
+                   int cflags, Oid collation,
                     int search_start, int n)
  {
     text       *ret_text;
-   regex_t    *re = (regex_t *) regexp;
+   regex_t    *re;
     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
     int         nmatches = 0;
     StringInfoData buf;
-   regmatch_t  pmatch[REGEXP_REPLACE_BACKREF_CNT];
+   regmatch_t  pmatch[10];     /* main match, plus \1 to \9 */
+   int         nmatch = lengthof(pmatch);
     pg_wchar   *data;
     size_t      data_len;
     int         data_pos;
     char       *start_ptr;
-   bool        have_escape;
+   int         escape_status;
  
     initStringInfo(&buf);
  
@@ -4528,8 +4523,19 @@ replace_text_regexp(text *src_text, void *regexp,
     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
  
-   /* Check whether replace_text has escape char. */
-   have_escape = check_replace_text_has_escape_char(replace_text);
+   /* Check whether replace_text has escapes, especially regexp submatches. */
+   escape_status = check_replace_text_has_escape(replace_text);
+
+   /* If no regexp submatches, we can use REG_NOSUB. */
+   if (escape_status < 2)
+   {
+       cflags |= REG_NOSUB;
+       /* Also tell pg_regexec we only want the whole-match location. */
+       nmatch = 1;
+   }
+
+   /* Prepare the regexp. */
+   re = RE_compile_and_cache(pattern_text, cflags, collation);
  
     /* start_ptr points to the data_pos'th character of src_text */
     start_ptr = (char *) VARDATA_ANY(src_text);
@@ -4546,7 +4552,7 @@ replace_text_regexp(text *src_text, void *regexp,
                                     data_len,
                                     search_start,
                                     NULL,   /* no details */
-                                   REGEXP_REPLACE_BACKREF_CNT,
+                                   nmatch,
                                     pmatch,
                                     0);
  
@@ -4602,10 +4608,9 @@ replace_text_regexp(text *src_text, void *regexp,
         }
  
         /*
-        * Copy the replace_text. Process back references when the
-        * replace_text has escape characters.
+        * Copy the replace_text, processing escapes if any are present.
          */
-       if (have_escape)
+       if (escape_status > 0)
             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
                                          start_ptr, data_pos);
         else
diff --git a/src/include/utils/varlena.h b/src/include/utils/varlena.h

index 6645e2af137a29c5e512406e7cf37bb0c56d929d..cd12252ed93b53f92bd5c44326fd2257ae0d8ccd 100644 (file)
--- a/src/include/utils/varlena.h
+++ b/src/include/utils/varlena.h
@@ -33,8 +33,9 @@ extern bool SplitDirectoriesString(char *rawstring, char separator,
                                    List **namelist);
  extern bool SplitGUCList(char *rawstring, char separator,
                          List **namelist);
-extern text *replace_text_regexp(text *src_text, void *regexp,
+extern text *replace_text_regexp(text *src_text, text *pattern_text,
                                  text *replace_text,
+                                int cflags, Oid collation,
                                  int search_start, int n);
  
  #endif
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index a9efd74c7b4fca18b2963f77a24064aac612b6e4..0f95b9400b697dc69fa11a7ca2cbf7402cf8b114 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -571,13 +571,32 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
  SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
  ERROR:  invalid escape string
  HINT:  Escape string must be empty or one character.
--- Test back reference in regexp_replace
+-- Test backslash escapes in regexp_replace's replacement string
  SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
   regexp_replace 
  ----------------
   (111) 222-3333
  (1 row)
  
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
+  regexp_replace   
+-------------------
+ fXooYbaXrrYbaXzzY
+(1 row)
+
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
+ regexp_replace 
+----------------
+ fX\YbaX\YbaX\Y
+(1 row)
+
+-- not an error, though perhaps it should be:
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
+ regexp_replace  
+-----------------
+ fX\YoZ\barrbazz
+(1 row)
+
  SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');
   regexp_replace 
  ----------------
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index 6a029cc36920d3e7e5a64974c222f9f5dda2f12b..8c379182cb9d608e036c0e071c894b23695c7f52 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -187,8 +187,13 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
  SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
  SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
  
--- Test back reference in regexp_replace
+-- Test backslash escapes in regexp_replace's replacement string
  SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
+-- not an error, though perhaps it should be:
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
+
  SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');
  SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 10 Aug 2021 00:53:25 +0000 (20:53 -0400)
src/backend/utils/adt/regexp.c		patch \| blob \| blame \| history
src/backend/utils/adt/varlena.c		patch \| blob \| blame \| history
src/include/utils/varlena.h		patch \| blob \| blame \| history
src/test/regress/expected/strings.out		patch \| blob \| blame \| history
src/test/regress/sql/strings.sql		patch \| blob \| blame \| history