From d46911e584d48ee01d5bf75699a77fbf635c865d Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 28 May 2025 08:58:40 +0900 Subject: Fix conversion of SIMILAR TO regexes for character classes The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not apply applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe Reviewed-by: Tom Lane Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13 --- src/test/regress/expected/strings.out | 62 +++++++++++++++++++++++++++++++++++ src/test/regress/sql/strings.sql | 20 +++++++++++ 2 files changed, 82 insertions(+) (limited to 'src/test') diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 174f0a68331..9c0aba5c029 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -614,6 +614,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; ERROR: invalid escape string HINT: Escape string must be empty or one character. +-- Characters that should be left alone in character classes when a +-- SIMILAR TO regexp pattern is converted to POSIX style. +-- Underscore "_" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; + QUERY PLAN +------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text) +(2 rows) + +-- Percentage "%" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text) +(2 rows) + +-- Dot "." +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text) +(2 rows) + +-- Dollar "$" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text) +(2 rows) + +-- Opening parenthesis "(" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](]('; +ERROR: invalid regular expression: parentheses () not balanced +-- Caret "^" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; + QUERY PLAN +------------------------------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text) +(2 rows) + +-- Closing square bracket "]" at the beginning of character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; + QUERY PLAN +------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text) +(2 rows) + +-- Closing square bracket effective after two carets at the beginning +-- of character class. +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; + QUERY PLAN +--------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:[^^]\^)$'::text) +(2 rows) + -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); regexp_replace diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index f7b325baadf..2a4fa534997 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -197,6 +197,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; +-- Characters that should be left alone in character classes when a +-- SIMILAR TO regexp pattern is converted to POSIX style. +-- Underscore "_" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; +-- Percentage "%" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; +-- Dot "." +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; +-- Dollar "$" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; +-- Opening parenthesis "(" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](]('; +-- Caret "^" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; +-- Closing square bracket "]" at the beginning of character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; +-- Closing square bracket effective after two carets at the beginning +-- of character class. +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; + -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g'); -- cgit v1.2.3