diff options
author | Jeff Davis | 2024-03-19 22:24:41 +0000 |
---|---|---|
committer | Jeff Davis | 2024-03-19 22:24:41 +0000 |
commit | f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch) | |
tree | 48077a7e6eb0309218b09a3be483aec37a6f204f /src/bin/initdb | |
parent | fd0398fcb099980fbedbb7750356ef234408c1c9 (diff) |
Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.
The builtin provider's C.UTF-8 offers several important advantages
over libc:
* faster sorting -- benefits from additional optimizations such as
  abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some
  libc implementations
* available on all platforms with identical semantics, and the
  semantics are stable, testable, and documentable within a given
  Postgres major version
Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
Diffstat (limited to 'src/bin/initdb')
-rw-r--r-- | src/bin/initdb/initdb.c | 16 | ||||
-rw-r--r-- | src/bin/initdb/t/001_initdb.pl | 17 |
2 files changed, 32 insertions, 1 deletions
```diff
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index c2daff17179..30e17bd1d1e 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2403,9 +2403,16 @@ setlocales(void)
 
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
-		if (strcmp(datlocale, "C") != 0)
+		if (strcmp(datlocale, "C") == 0)
+			canonname = "C";
+		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
+				 strcmp(datlocale, "C.UTF8") == 0)
+			canonname = "C.UTF-8";
+		else
 			pg_fatal("invalid locale name \"%s\" for builtin provider",
 					 datlocale);
+
+		datlocale = canonname;
 	}
 	else if (locale_provider == COLLPROVIDER_ICU)
 	{
@@ -2695,6 +2702,13 @@ setup_locale_encoding(void)
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1); /* check_locale_encoding printed the error */
 
+	if (locale_provider == COLLPROVIDER_BUILTIN)
+	{
+		if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
+					 datlocale, "UTF-8");
+	}
+
 	if (locale_provider == COLLPROVIDER_ICU &&
 		!check_icu_locale_encoding(encodingid))
 		exit(1);
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index 3478f58b02a..c63d3206d99 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -199,6 +199,23 @@ command_ok(
 command_ok(
 	[
 		'initdb', '--no-sync',
+		'--locale-provider=builtin', '-E UTF-8',
+		'--builtin-locale=C.UTF-8', "$tempdir/data8"
+	],
+	'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8');
+
+command_fails(
+	[
+		'initdb', '--no-sync',
+		'--locale-provider=builtin', '-E SQL_ASCII',
+		'--builtin-locale=C.UTF-8', "$tempdir/data9"
+	],
+	'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII'
+);
+
+command_ok(
+	[
+		'initdb', '--no-sync',
 		'--locale-provider=builtin', '--lc-ctype=C',
 		'--locale=C', "$tempdir/data10"
 	],
```