Repair bug in regexp split performance improvements.
author: Andrew Gierth <rhodiumtoad@postgresql.org>
Wed, 12 Sep 2018 18:31:06 +0000 (19:31 +0100)
committer: Andrew Gierth <rhodiumtoad@postgresql.org>
Wed, 12 Sep 2018 18:31:06 +0000 (19:31 +0100)
Commit c8ea87e4b introduced a temporary conversion buffer for
substrings extracted during regexp splits. Unfortunately the code that
sized it was failing to ignore the effects of ignored degenerate
regexp matches, so for regexp_split_* calls it could under-size the
buffer in such cases.

Fix, and add some regression test cases (though those will only catch
the bug if run in a multibyte encoding).

Backpatch to 9.3, as the faulty code was.

Thanks to the PostGIS project, Regina Obe and Paul Ramsey for the
report (via IRC) and assistance in analysis. Patch by me.

src/backend/utils/adt/regexp.c
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index d8b692123421b5e46e2b5c48f6e832b64a5def9e..171fcc8a448cc7b1e4f42b843821cef94dbfc5af 100644 (file)
@@ -982,6 +982,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
    int         array_len;
    int         array_idx;
    int         prev_match_end;
+   int         prev_valid_match_end;
    int         start_search;
    int         maxlen = 0;     /* largest fetch length in characters */
 
@@ -1024,6 +1025,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
    /* search for the pattern, perhaps repeatedly */
    prev_match_end = 0;
+   prev_valid_match_end = 0;
    start_search = 0;
    while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
                            pmatch_len, pmatch))
@@ -1076,13 +1078,15 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
            matchctx->nmatches++;
 
            /*
-            * check length of unmatched portion between end of previous match
-            * and start of current one
+            * check length of unmatched portion between end of previous valid
+            * (nondegenerate, or degenerate but not ignored) match and start
+            * of current one
             */
            if (fetching_unmatched &&
                pmatch[0].rm_so >= 0 &&
-               (pmatch[0].rm_so - prev_match_end) > maxlen)
-               maxlen = (pmatch[0].rm_so - prev_match_end);
+               (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
+               maxlen = (pmatch[0].rm_so - prev_valid_match_end);
+           prev_valid_match_end = pmatch[0].rm_eo;
        }
        prev_match_end = pmatch[0].rm_eo;
 
@@ -1108,8 +1112,8 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     * input string
     */
    if (fetching_unmatched &&
-       (wide_len - prev_match_end) > maxlen)
-       maxlen = (wide_len - prev_match_end);
+       (wide_len - prev_valid_match_end) > maxlen)
+       maxlen = (wide_len - prev_valid_match_end);
 
    /*
     * Keep a note of the end position of the string for the benefit of
index cbe66c375ca179a9bec2b55ce27b03512a2c0415..189bdffdca084cbf126620ebc3e9428485ff68f2 100644 (file)
@@ -674,6 +674,24 @@ SELECT regexp_split_to_array('123456','.');
  {"","","","","","",""}
 (1 row)
 
+SELECT regexp_split_to_array('123456','');
+ regexp_split_to_array 
+-----------------------
+ {1,2,3,4,5,6}
+(1 row)
+
+SELECT regexp_split_to_array('123456','(?:)');
+ regexp_split_to_array 
+-----------------------
+ {1,2,3,4,5,6}
+(1 row)
+
+SELECT regexp_split_to_array('1','');
+ regexp_split_to_array 
+-----------------------
+ {1}
+(1 row)
+
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
 ERROR:  invalid regexp option: "z"
index 5a82237870e205f55a6e7ae7c869195758f82f5c..f2203ef1b1d23af8b5d797f1c6dcbb82d7113c1b 100644 (file)
@@ -188,6 +188,9 @@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nom
 SELECT regexp_split_to_array('123456','1');
 SELECT regexp_split_to_array('123456','6');
 SELECT regexp_split_to_array('123456','.');
+SELECT regexp_split_to_array('123456','');
+SELECT regexp_split_to_array('123456','(?:)');
+SELECT regexp_split_to_array('1','');
 -- errors
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
 SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');