diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/regex/regc_pg_locale.c | 6 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 7 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 12 | ||||
-rw-r--r-- | src/bin/initdb/initdb.c | 6 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_collation.dat | 3 | ||||
-rw-r--r-- | src/include/utils/pg_locale.h | 1 | ||||
-rw-r--r-- | src/test/regress/expected/collate.utf8.out | 160 | ||||
-rw-r--r-- | src/test/regress/sql/collate.utf8.sql | 60 |
9 files changed, 248 insertions, 9 deletions
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 2360d08efae..ed7411df83d 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -307,7 +307,7 @@ pg_wc_isdigit(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, true); + return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); @@ -361,7 +361,7 @@ pg_wc_isalnum(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, true); + return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); @@ -505,7 +505,7 @@ pg_wc_ispunct(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, true); + return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 875cca6efc8..94444acd2c5 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale) { if (strcmp(locale, "C") == 0) return -1; - if (strcmp(locale, "C.UTF-8") == 0) + else if (strcmp(locale, "C.UTF-8") == 0) return PG_UTF8; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + return PG_UTF8; + ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale) canonical_name = "C"; else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) canonical_name = "C.UTF-8"; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + canonical_name = "PG_UNICODE_FAST"; if (!canonical_name) ereport(ERROR, diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index fef5b6e6d38..436e32c0ca0 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -78,7 +78,8 @@ size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - return unicode_strlower(dest, destsize, src, srclen, false); + return unicode_strlower(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); } size_t @@ -93,7 +94,8 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .prev_alnum = false, }; - return unicode_strtitle(dest, destsize, src, srclen, false, + return unicode_strtitle(dest, destsize, src, srclen, + locale->info.builtin.casemap_full, initcap_wbnext, &wbstate); } @@ -101,7 +103,8 @@ size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - return unicode_strupper(dest, destsize, src, srclen, false); + return unicode_strupper(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); } pg_locale_t @@ -142,6 +145,7 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.builtin.locale = MemoryContextStrdup(context, locstr); + result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; @@ -164,6 +168,8 @@ get_collation_actual_version_builtin(const char *collcollate) return "1"; else if (strcmp(collcollate, "C.UTF-8") == 0) return "1"; + else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0) + return "1"; else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ea4b66b3bf5..759672a9b97 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2489,6 +2489,8 @@ setlocales(void) else if (strcmp(datlocale, "C.UTF-8") == 0 || strcmp(datlocale, "C.UTF8") == 0) canonname = "C.UTF-8"; + else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0) + canonname = "PG_UNICODE_FAST"; else pg_fatal("invalid locale name \"%s\" for builtin provider", datlocale); @@ -2782,7 +2784,9 @@ setup_locale_encoding(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8) + if ((strcmp(datlocale, "C.UTF-8") == 0 || + strcmp(datlocale, "PG_UNICODE_FAST") == 0) && + encodingid != PG_UTF8) pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", datlocale, "UTF-8"); } diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 54856ab214d..28de0c83342 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202501162 +#define CATALOG_VERSION_NO 202501171 #endif diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 5fa2d33e94b..fb76c421931 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -33,5 +33,8 @@ descr => 'sorts by Unicode code point; Unicode and POSIX character semantics', collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', colllocale => 'C.UTF-8', collversion => '1' }, +{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics', + collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6', + colllocale => 'PG_UNICODE_FAST', collversion => '1' }, ] diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index ec42ca3da4c..2bc3a7df2d9 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -108,6 +108,7 @@ struct pg_locale_struct struct { const char *locale; + bool casemap_full; } builtin; locale_t lt; #ifdef USE_ICU diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 4558d2521a2..8b7176a2756 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -160,3 +160,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed t (1 row) +-- +-- Test PG_UNICODE_FAST +-- +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +ERROR: invalid locale name "unicode" for builtin provider +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE test_pg_unicode_fast; +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 + lower +------- + ας +(1 row) + +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 + lower +------- + ας0 +(1 row) + +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 + lower +------- + ἀς̓ +(1 row) + +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + lower +------- + ᾳςͅ +(1 row) + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 + lower +------- + σ +(1 row) + +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 + lower +------- + 0σ +(1 row) + +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 + lower +------- + ασα +(1 row) + +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 + lower +------- + ἀσ̓α +(1 row) + +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 + lower +------- + ᾳσͅα +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 87fe06ddf1b..46e9c5232ad 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -80,3 +80,63 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + +-- +-- Test PG_UNICODE_FAST +-- + +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); + +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + +DROP TABLE test_pg_unicode_fast; + +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed |