Add three new regexp functions: regexp_matches, regexp_split_to_array,

and regexp_split_to_table. These functions provide access to the capture groups resulting from a POSIX regular expression match, and provide the ability to split a string on a POSIX regular expression, respectively. Patch from Jeremy Drake; code review by Neil Conway, additional comments and suggestions from Tom and Peter E. This patch bumps the catversion, adds some regression tests, and updates the docs.
author: Neil Conway 2007-03-20 05:45:00 +0000
committer: Neil Conway 2007-03-20 05:45:00 +0000
commit: 9eb78beeae01f2f0ccafc5d66a2003ea7e3952f9 (patch)
tree: 8e4139ea736ab91319fff21b8654ef4d9150ac55 /src/test
parent: 5e96b04a7cb1f35aa1b75680f8bfbeac7cedc178 (diff)
2 files changed, 277 insertions, 3 deletions
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 20081f699b..e11dfc35d7 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -217,9 +217,232 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
  Z Z
 (1 row)
 
--- invalid option of REGEXP_REPLACE
+-- invalid regexp option
 SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
-ERROR:  invalid option of regexp_replace: z
+ERROR:  invalid regexp option: z
+-- set so we can tell NULL from empty string
+\pset null '\\N'
+-- return all matches from regexp
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$);
+ regexp_matches 
+----------------
+ {bar,beque}
+(1 row)
+
+-- test case insensitive
+SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i');
+ regexp_matches 
+----------------
+ {bAR,bEqUE}
+(1 row)
+
+-- global option - more than one match
+SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g');
+ regexp_matches 
+----------------
+ {bar,beque}
+ {bazil,barf}
+(2 rows)
+
+-- empty capture group (matched empty string)
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);
+ regexp_matches 
+----------------
+ {bar,"",beque}
+(1 row)
+
+-- no match
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$);
+ regexp_matches 
+----------------
+(0 rows)
+
+-- optional capture group did not match, null entry in array
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
+  regexp_matches  
+------------------
+ {bar,NULL,beque}
+(1 row)
+
+-- no capture groups
+SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
+ regexp_matches 
+----------------
+ {barbeque}
+(1 row)
+
+-- give me errors
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
+ERROR:  invalid regexp option: z
+SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
+ERROR:  invalid regular expression: parentheses () not balanced
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
+ERROR:  invalid regular expression: invalid repetition count(s)
+-- split string on regexp
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo;
+  foo   | length 
+--------+--------
+ the    |      3
+ quick  |      5
+ brown  |      5
+ fox    |      3
+ jumped |      6
+ over   |      4
+ the    |      3
+ lazy   |      4
+ dog    |      3
+(9 rows)
+
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s+$re$);
+             regexp_split_to_array              
+------------------------------------------------
+ {the,quick,brown,fox,jumped,over,the,lazy,dog}
+(1 row)
+
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo;
+ foo | length 
+-----+--------
+ t   |      1
+ h   |      1
+ e   |      1
+ q   |      1
+ u   |      1
+ i   |      1
+ c   |      1
+ k   |      1
+ b   |      1
+ r   |      1
+ o   |      1
+ w   |      1
+ n   |      1
+ f   |      1
+ o   |      1
+ x   |      1
+ j   |      1
+ u   |      1
+ m   |      1
+ p   |      1
+ e   |      1
+ d   |      1
+ o   |      1
+ v   |      1
+ e   |      1
+ r   |      1
+ t   |      1
+ h   |      1
+ e   |      1
+ l   |      1
+ a   |      1
+ z   |      1
+ y   |      1
+ d   |      1
+ o   |      1
+ g   |      1
+(36 rows)
+
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s*$re$);
+                           regexp_split_to_array                           
+---------------------------------------------------------------------------
+ {t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,e,d,o,v,e,r,t,h,e,l,a,z,y,d,o,g}
+(1 row)
+
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', '') AS foo;
+ foo | length 
+-----+--------
+ t   |      1
+ h   |      1
+ e   |      1
+     |      1
+ q   |      1
+ u   |      1
+ i   |      1
+ c   |      1
+ k   |      1
+     |      1
+ b   |      1
+ r   |      1
+ o   |      1
+ w   |      1
+ n   |      1
+     |      1
+ f   |      1
+ o   |      1
+ x   |      1
+     |      1
+ j   |      1
+ u   |      1
+ m   |      1
+ p   |      1
+ e   |      1
+ d   |      1
+     |      1
+ o   |      1
+ v   |      1
+ e   |      1
+ r   |      1
+     |      1
+ t   |      1
+ h   |      1
+ e   |      1
+     |      1
+ l   |      1
+ a   |      1
+ z   |      1
+ y   |      1
+     |      1
+ d   |      1
+ o   |      1
+ g   |      1
+(44 rows)
+
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', '');
+                                           regexp_split_to_array                                           
+-----------------------------------------------------------------------------------------------------------
+ {t,h,e," ",q,u,i,c,k," ",b,r,o,w,n," ",f,o,x," ",j,u,m,p,e,d," ",o,v,e,r," ",t,h,e," ",l,a,z,y," ",d,o,g}
+(1 row)
+
+-- case insensitive
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo;
+          foo          | length 
+-----------------------+--------
+ th                    |      2
+  QUick bROWn FOx jUMP |     21
+ d ov                  |      4
+ r TH                  |      4
+  lazy dOG             |      9
+(5 rows)
+
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i');
+                 regexp_split_to_array                  
+--------------------------------------------------------
+ {th," QUick bROWn FOx jUMP","d ov","r TH"," lazy dOG"}
+(1 row)
+
+-- no match of pattern
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo;
+                     foo                      | length 
+----------------------------------------------+--------
+ the quick brown fox jumped over the lazy dog |     44
+(1 row)
+
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch');
+              regexp_split_to_array               
+--------------------------------------------------
+ {"the quick brown fox jumped over the lazy dog"}
+(1 row)
+
+-- errors
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
+ERROR:  invalid regexp option: z
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
+ERROR:  invalid regexp option: z
+-- global option meaningless for regexp_split
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
+ERROR:  regexp_split does not support the global option
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g');
+ERROR:  regexp_split does not support the global option
+-- change NULL-display back
+\pset null ''
 -- E021-11 position expression
 SELECT POSITION('4' IN '1234567890') = '4' AS "4";
  4 
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 570d9a27f4..389ff63517 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -85,9 +85,60 @@ SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3
 SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');
 SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
 SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
--- invalid option of REGEXP_REPLACE
+-- invalid regexp option
 SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
 
+-- set so we can tell NULL from empty string
+\pset null '\\N'
+
+-- return all matches from regexp
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$);
+
+-- test case insensitive
+SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i');
+
+-- global option - more than one match
+SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g');
+
+-- empty capture group (matched empty string)
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);
+-- no match
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$);
+-- optional capture group did not match, null entry in array
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
+
+-- no capture groups
+SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
+
+-- give me errors
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper');
+SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
+SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
+
+-- split string on regexp
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo;
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s+$re$);
+
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo;
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s*$re$);
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', '') AS foo;
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', '');
+-- case insensitive
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo;
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i');
+-- no match of pattern
+SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo;
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch');
+-- errors
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo;
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy');
+-- global option meaningless for regexp_split
+SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo;
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g');
+
+-- change NULL-display back
+\pset null ''
+
 -- E021-11 position expression
 SELECT POSITION('4' IN '1234567890') = '4' AS "4";
author	Neil Conway	2007-03-20 05:45:00 +0000
committer	Neil Conway	2007-03-20 05:45:00 +0000
commit	9eb78beeae01f2f0ccafc5d66a2003ea7e3952f9 (patch)
tree	8e4139ea736ab91319fff21b8654ef4d9150ac55 /src/test
parent	5e96b04a7cb1f35aa1b75680f8bfbeac7cedc178 (diff)