summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/regex/regc_pg_locale.c6
-rw-r--r--src/backend/utils/adt/pg_locale.c7
-rw-r--r--src/backend/utils/adt/pg_locale_builtin.c12
-rw-r--r--src/bin/initdb/initdb.c6
-rw-r--r--src/include/catalog/catversion.h2
-rw-r--r--src/include/catalog/pg_collation.dat3
-rw-r--r--src/include/utils/pg_locale.h1
-rw-r--r--src/test/regress/expected/collate.utf8.out160
-rw-r--r--src/test/regress/sql/collate.utf8.sql60
9 files changed, 248 insertions, 9 deletions
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 2360d08efae..ed7411df83d 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -307,7 +307,7 @@ pg_wc_isdigit(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isdigit(c, true);
+ return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
@@ -361,7 +361,7 @@ pg_wc_isalnum(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isalnum(c, true);
+ return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
@@ -505,7 +505,7 @@ pg_wc_ispunct(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_ispunct(c, true);
+ return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 875cca6efc8..94444acd2c5 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale)
{
if (strcmp(locale, "C") == 0)
return -1;
- if (strcmp(locale, "C.UTF-8") == 0)
+ else if (strcmp(locale, "C.UTF-8") == 0)
return PG_UTF8;
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ return PG_UTF8;
+
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale)
canonical_name = "C";
else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
canonical_name = "C.UTF-8";
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ canonical_name = "PG_UNICODE_FAST";
if (!canonical_name)
ereport(ERROR,
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index fef5b6e6d38..436e32c0ca0 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -78,7 +78,8 @@ size_t
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strlower(dest, destsize, src, srclen, false);
+ return unicode_strlower(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
size_t
@@ -93,7 +94,8 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
.prev_alnum = false,
};
- return unicode_strtitle(dest, destsize, src, srclen, false,
+ return unicode_strtitle(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full,
initcap_wbnext, &wbstate);
}
@@ -101,7 +103,8 @@ size_t
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strupper(dest, destsize, src, srclen, false);
+ return unicode_strupper(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
pg_locale_t
@@ -142,6 +145,7 @@ create_pg_locale_builtin(Oid collid, MemoryContext context)
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
result->info.builtin.locale = MemoryContextStrdup(context, locstr);
+ result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
result->provider = COLLPROVIDER_BUILTIN;
result->deterministic = true;
result->collate_is_c = true;
@@ -164,6 +168,8 @@ get_collation_actual_version_builtin(const char *collcollate)
return "1";
else if (strcmp(collcollate, "C.UTF-8") == 0)
return "1";
+ else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
+ return "1";
else
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index ea4b66b3bf5..759672a9b97 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2489,6 +2489,8 @@ setlocales(void)
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "C.UTF8") == 0)
canonname = "C.UTF-8";
+ else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
+ canonname = "PG_UNICODE_FAST";
else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
@@ -2782,7 +2784,9 @@ setup_locale_encoding(void)
if (locale_provider == COLLPROVIDER_BUILTIN)
{
- if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+ if ((strcmp(datlocale, "C.UTF-8") == 0 ||
+ strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
+ encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 54856ab214d..28de0c83342 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202501162
+#define CATALOG_VERSION_NO 202501171
#endif
diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat
index 5fa2d33e94b..fb76c421931 100644
--- a/src/include/catalog/pg_collation.dat
+++ b/src/include/catalog/pg_collation.dat
@@ -33,5 +33,8 @@
descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
colllocale => 'C.UTF-8', collversion => '1' },
+{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',
+ collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
+ colllocale => 'PG_UNICODE_FAST', collversion => '1' },
]
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index ec42ca3da4c..2bc3a7df2d9 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -108,6 +108,7 @@ struct pg_locale_struct
struct
{
const char *locale;
+ bool casemap_full;
} builtin;
locale_t lt;
#ifdef USE_ICU
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 4558d2521a2..8b7176a2756 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -160,3 +160,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
t
(1 row)
+--
+-- Test PG_UNICODE_FAST
+--
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+ERROR: invalid locale name "unicode" for builtin provider
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+ t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes
+-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+---------------
+ abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
+ ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
+ DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
+ ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
+ ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
+ ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
+(6 rows)
+
+DROP TABLE test_pg_unicode_fast;
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+ lower
+-------
+ ας
+(1 row)
+
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+ lower
+-------
+ ας0
+(1 row)
+
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+ lower
+-------
+ ἀς̓
+(1 row)
+
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+ lower
+-------
+ ᾳςͅ
+(1 row)
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+ lower
+-------
+ σ
+(1 row)
+
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+ lower
+-------
+ 0σ
+(1 row)
+
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+ lower
+-------
+ ασα
+(1 row)
+
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+ lower
+-------
+ ἀσ̓α
+(1 row)
+
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+ lower
+-------
+ ᾳσͅα
+(1 row)
+
+-- properties
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+-- case mapping
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
+ ?column?
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index 87fe06ddf1b..46e9c5232ad 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -80,3 +80,63 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
+
+--
+-- Test PG_UNICODE_FAST
+--
+
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+
+DROP TABLE test_pg_unicode_fast;
+
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+
+-- properties
+
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+
+-- case mapping
+
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed