summaryrefslogtreecommitdiff
path: root/src/bin/initdb
diff options
context:
space:
mode:
author Jeff Davis 2024-03-19 22:24:41 +0000
committer Jeff Davis 2024-03-19 22:24:41 +0000
commit f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch)
tree 48077a7e6eb0309218b09a3be483aec37a6f204f /src/bin/initdb
parent fd0398fcb099980fbedbb7750356ef234408c1c9 (diff)
Support C.UTF-8 locale in the new builtin collation provider.

The builtin C.UTF-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages over libc:

 * faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some libc implementations
 * available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.UTF-8" locale, as well as many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider

Diffstat (limited to 'src/bin/initdb')
-rw-r--r-- src/bin/initdb/initdb.c 16
-rw-r--r-- src/bin/initdb/t/001_initdb.pl 17
2 files changed, 32 insertions, 1 deletions
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index c2daff17179..30e17bd1d1e 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2403,9 +2403,16 @@ setlocales(void)
if (locale_provider == COLLPROVIDER_BUILTIN)
{
- if (strcmp(datlocale, "C") != 0)
+ if (strcmp(datlocale, "C") == 0)
+ canonname = "C";
+ else if (strcmp(datlocale, "C.UTF-8") == 0 ||
+ strcmp(datlocale, "C.UTF8") == 0)
+ canonname = "C.UTF-8";
+ else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
+
+ datlocale = canonname;
}
else if (locale_provider == COLLPROVIDER_ICU)
{
@@ -2695,6 +2702,13 @@ setup_locale_encoding(void)
!check_locale_encoding(lc_collate, encodingid))
exit(1); /* check_locale_encoding printed the error */
+ if (locale_provider == COLLPROVIDER_BUILTIN)
+ {
+ if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+ pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
+ datlocale, "UTF-8");
+ }
+
if (locale_provider == COLLPROVIDER_ICU &&
!check_icu_locale_encoding(encodingid))
exit(1);
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index 3478f58b02a..c63d3206d99 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -199,6 +199,23 @@ command_ok(
command_ok(
[
'initdb', '--no-sync',
+ '--locale-provider=builtin', '-E UTF-8',
+ '--builtin-locale=C.UTF-8', "$tempdir/data8"
+ ],
+ 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8');
+
+command_fails(
+ [
+ 'initdb', '--no-sync',
+ '--locale-provider=builtin', '-E SQL_ASCII',
+ '--builtin-locale=C.UTF-8', "$tempdir/data9"
+ ],
+ 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII'
+);
+
+command_ok(
+ [
+ 'initdb', '--no-sync',
'--locale-provider=builtin', '--lc-ctype=C',
'--locale=C', "$tempdir/data10"
],