PostgreSQL Source Code git master
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucnv.h>
16#include <unicode/ustring.h>
17
18/*
19 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 * (see
21 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 */
23#if U_ICU_VERSION_MAJOR_NUM >= 53
24#define HAVE_UCOL_STRCOLLUTF8 1
25#else
26#undef HAVE_UCOL_STRCOLLUTF8
27#endif
28
29#endif
30
31#include "access/htup_details.h"
32#include "catalog/pg_database.h"
34#include "mb/pg_wchar.h"
35#include "miscadmin.h"
36#include "utils/builtins.h"
37#include "utils/formatting.h"
38#include "utils/memutils.h"
39#include "utils/pg_locale.h"
40#include "utils/syscache.h"
41
/*
 * Byte size of the on-stack scratch buffer used by the string
 * transformation routines in this file.  Inputs whose converted form fits in
 * this many bytes avoid a heap allocation; anything larger falls back to
 * palloc().  The value is a compromise: large enough for typical strings,
 * small enough to be comfortable as a stack array.
 */
#define TEXTBUFLEN			1024
49
51
52#ifdef USE_ICU
53
54extern UCollator *pg_ucol_open(const char *loc_str);
55
56static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 ssize_t srclen, pg_locale_t locale);
58static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 ssize_t srclen, pg_locale_t locale);
60static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 ssize_t srclen, pg_locale_t locale);
62static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 ssize_t srclen, pg_locale_t locale);
64static int strncoll_icu(const char *arg1, ssize_t len1,
65 const char *arg2, ssize_t len2,
67static size_t strnxfrm_icu(char *dest, size_t destsize,
68 const char *src, ssize_t srclen,
70extern char *get_collation_actual_version_icu(const char *collcollate);
71
72typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
73 const UChar *src, int32_t srcLength,
74 const char *locale,
75 UErrorCode *pErrorCode);
76
77/*
78 * Converter object for converting between ICU's UChar strings and C strings
79 * in database encoding. Since the database encoding doesn't change, we only
80 * need one of these per session.
81 */
82static UConverter *icu_converter = NULL;
83
84static UCollator *make_icu_collator(const char *iculocstr,
85 const char *icurules);
86static int strncoll_icu(const char *arg1, ssize_t len1,
87 const char *arg2, ssize_t len2,
89static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
90 const char *src, ssize_t srclen,
92#ifdef HAVE_UCOL_STRCOLLUTF8
93static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
94 const char *arg2, ssize_t len2,
96#endif
97static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
98 const char *src, ssize_t srclen,
100static void init_icu_converter(void);
101static size_t uchar_length(UConverter *converter,
102 const char *str, int32_t len);
103static int32_t uchar_convert(UConverter *converter,
104 UChar *dest, int32_t destlen,
105 const char *src, int32_t srclen);
106static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
107 size_t nbytes);
108static size_t icu_from_uchar(char *dest, size_t destsize,
109 const UChar *buff_uchar, int32_t len_uchar);
110static void icu_set_collation_attributes(UCollator *collator, const char *loc,
111 UErrorCode *status);
112static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
113 UChar **buff_dest, UChar *buff_source,
114 int32_t len_source);
115static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
116 const UChar *src, int32_t srcLength,
117 const char *locale,
118 UErrorCode *pErrorCode);
119static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
120 const UChar *src, int32_t srcLength,
121 const char *locale,
122 UErrorCode *pErrorCode);
123
124/*
125 * XXX: many of the functions below rely on casts directly from pg_wchar to
126 * UChar32, which is correct for the UTF-8 encoding, but not in general.
127 */
128
129static pg_wchar
130toupper_icu(pg_wchar wc, pg_locale_t locale)
131{
132 return u_toupper(wc);
133}
134
135static pg_wchar
136tolower_icu(pg_wchar wc, pg_locale_t locale)
137{
138 return u_tolower(wc);
139}
140
141static const struct collate_methods collate_methods_icu = {
142 .strncoll = strncoll_icu,
143 .strnxfrm = strnxfrm_icu,
144 .strnxfrm_prefix = strnxfrm_prefix_icu,
145 .strxfrm_is_safe = true,
146};
147
148static const struct collate_methods collate_methods_icu_utf8 = {
149#ifdef HAVE_UCOL_STRCOLLUTF8
150 .strncoll = strncoll_icu_utf8,
151#else
152 .strncoll = strncoll_icu,
153#endif
154 .strnxfrm = strnxfrm_icu,
155 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
156 .strxfrm_is_safe = true,
157};
158
159static bool
160wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
161{
162 return u_isdigit(wc);
163}
164
165static bool
166wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
167{
168 return u_isalpha(wc);
169}
170
171static bool
172wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
173{
174 return u_isalnum(wc);
175}
176
177static bool
178wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
179{
180 return u_isupper(wc);
181}
182
183static bool
184wc_islower_icu(pg_wchar wc, pg_locale_t locale)
185{
186 return u_islower(wc);
187}
188
189static bool
190wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
191{
192 return u_isgraph(wc);
193}
194
195static bool
196wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
197{
198 return u_isprint(wc);
199}
200
201static bool
202wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
203{
204 return u_ispunct(wc);
205}
206
207static bool
208wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
209{
210 return u_isspace(wc);
211}
212
213static bool
214wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
215{
216 return u_isxdigit(wc);
217}
218
219static bool
220wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
221{
222 return u_hasBinaryProperty(wc, UCHAR_CASED);
223}
224
225static const struct ctype_methods ctype_methods_icu = {
226 .strlower = strlower_icu,
227 .strtitle = strtitle_icu,
228 .strupper = strupper_icu,
229 .strfold = strfold_icu,
230 .wc_isdigit = wc_isdigit_icu,
231 .wc_isalpha = wc_isalpha_icu,
232 .wc_isalnum = wc_isalnum_icu,
233 .wc_isupper = wc_isupper_icu,
234 .wc_islower = wc_islower_icu,
235 .wc_isgraph = wc_isgraph_icu,
236 .wc_isprint = wc_isprint_icu,
237 .wc_ispunct = wc_ispunct_icu,
238 .wc_isspace = wc_isspace_icu,
239 .wc_isxdigit = wc_isxdigit_icu,
240 .wc_iscased = wc_iscased_icu,
241 .wc_toupper = toupper_icu,
242 .wc_tolower = tolower_icu,
243};
244#endif
245
248{
249#ifdef USE_ICU
250 bool deterministic;
251 const char *iculocstr;
252 const char *icurules = NULL;
253 UCollator *collator;
254 pg_locale_t result;
255
256 if (collid == DEFAULT_COLLATION_OID)
257 {
258 HeapTuple tp;
259 Datum datum;
260 bool isnull;
261
263 if (!HeapTupleIsValid(tp))
264 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
265
266 /* default database collation is always deterministic */
267 deterministic = true;
268 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
269 Anum_pg_database_datlocale);
270 iculocstr = TextDatumGetCString(datum);
271 datum = SysCacheGetAttr(DATABASEOID, tp,
272 Anum_pg_database_daticurules, &isnull);
273 if (!isnull)
274 icurules = TextDatumGetCString(datum);
275
276 ReleaseSysCache(tp);
277 }
278 else
279 {
280 Form_pg_collation collform;
281 HeapTuple tp;
282 Datum datum;
283 bool isnull;
284
286 if (!HeapTupleIsValid(tp))
287 elog(ERROR, "cache lookup failed for collation %u", collid);
288 collform = (Form_pg_collation) GETSTRUCT(tp);
289 deterministic = collform->collisdeterministic;
290 datum = SysCacheGetAttrNotNull(COLLOID, tp,
291 Anum_pg_collation_colllocale);
292 iculocstr = TextDatumGetCString(datum);
293 datum = SysCacheGetAttr(COLLOID, tp,
294 Anum_pg_collation_collicurules, &isnull);
295 if (!isnull)
296 icurules = TextDatumGetCString(datum);
297
298 ReleaseSysCache(tp);
299 }
300
301 collator = make_icu_collator(iculocstr, icurules);
302
303 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
304 result->icu.locale = MemoryContextStrdup(context, iculocstr);
305 result->icu.ucol = collator;
306 result->deterministic = deterministic;
307 result->collate_is_c = false;
308 result->ctype_is_c = false;
310 result->collate = &collate_methods_icu_utf8;
311 else
312 result->collate = &collate_methods_icu;
313 result->ctype = &ctype_methods_icu;
314
315 return result;
316#else
317 /* could get here if a collation was created by a build with ICU */
319 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
320 errmsg("ICU is not supported in this build")));
321
322 return NULL;
323#endif
324}
325
326#ifdef USE_ICU
327
328/*
329 * Wrapper around ucol_open() to handle API differences for older ICU
330 * versions.
331 *
332 * Ensure that no path leaks a UCollator.
333 */
334UCollator *
335pg_ucol_open(const char *loc_str)
336{
337 UCollator *collator;
338 UErrorCode status;
339 const char *orig_str = loc_str;
340 char *fixed_str = NULL;
341
342 /*
343 * Must never open default collator, because it depends on the environment
344 * and may change at any time. Should not happen, but check here to catch
345 * bugs that might be hard to catch otherwise.
346 *
347 * NB: the default collator is not the same as the collator for the root
348 * locale. The root locale may be specified as the empty string, "und", or
349 * "root". The default collator is opened by passing NULL to ucol_open().
350 */
351 if (loc_str == NULL)
352 elog(ERROR, "opening default collator is not supported");
353
354 /*
355 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
356 * the root locale. If the first component of the locale is "und", replace
357 * with "root" before opening.
358 */
359 if (U_ICU_VERSION_MAJOR_NUM < 55)
360 {
361 char lang[ULOC_LANG_CAPACITY];
362
363 status = U_ZERO_ERROR;
364 uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
365 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
366 {
368 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
369 errmsg("could not get language from locale \"%s\": %s",
370 loc_str, u_errorName(status))));
371 }
372
373 if (strcmp(lang, "und") == 0)
374 {
375 const char *remainder = loc_str + strlen("und");
376
377 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
378 strcpy(fixed_str, "root");
379 strcat(fixed_str, remainder);
380
381 loc_str = fixed_str;
382 }
383 }
384
385 status = U_ZERO_ERROR;
386 collator = ucol_open(loc_str, &status);
387 if (U_FAILURE(status))
389 /* use original string for error report */
390 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
391 errmsg("could not open collator for locale \"%s\": %s",
392 orig_str, u_errorName(status))));
393
394 if (U_ICU_VERSION_MAJOR_NUM < 54)
395 {
396 status = U_ZERO_ERROR;
397 icu_set_collation_attributes(collator, loc_str, &status);
398
399 /*
400 * Pretend the error came from ucol_open(), for consistent error
401 * message across ICU versions.
402 */
403 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
404 {
405 ucol_close(collator);
407 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
408 errmsg("could not open collator for locale \"%s\": %s",
409 orig_str, u_errorName(status))));
410 }
411 }
412
413 if (fixed_str != NULL)
414 pfree(fixed_str);
415
416 return collator;
417}
418
419/*
420 * Create a UCollator with the given locale string and rules.
421 *
422 * Ensure that no path leaks a UCollator.
423 */
424static UCollator *
425make_icu_collator(const char *iculocstr, const char *icurules)
426{
427 if (!icurules)
428 {
429 /* simple case without rules */
430 return pg_ucol_open(iculocstr);
431 }
432 else
433 {
434 UCollator *collator_std_rules;
435 UCollator *collator_all_rules;
436 const UChar *std_rules;
437 UChar *my_rules;
438 UChar *all_rules;
439 int32_t length;
440 int32_t total;
441 UErrorCode status;
442
443 /*
444 * If rules are specified, we extract the rules of the standard
445 * collation, add our own rules, and make a new collator with the
446 * combined rules.
447 */
448 icu_to_uchar(&my_rules, icurules, strlen(icurules));
449
450 collator_std_rules = pg_ucol_open(iculocstr);
451
452 std_rules = ucol_getRules(collator_std_rules, &length);
453
454 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
455
456 /* avoid leaking collator on OOM */
457 all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
458 if (!all_rules)
459 {
460 ucol_close(collator_std_rules);
462 (errcode(ERRCODE_OUT_OF_MEMORY),
463 errmsg("out of memory")));
464 }
465
466 u_strcpy(all_rules, std_rules);
467 u_strcat(all_rules, my_rules);
468
469 ucol_close(collator_std_rules);
470
471 status = U_ZERO_ERROR;
472 collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
473 UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
474 NULL, &status);
475 if (U_FAILURE(status))
476 {
478 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
479 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
480 iculocstr, icurules, u_errorName(status))));
481 }
482
483 return collator_all_rules;
484 }
485}
486
487static size_t
488strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
490{
491 int32_t len_uchar;
492 int32_t len_conv;
493 UChar *buff_uchar;
494 UChar *buff_conv;
495 size_t result_len;
496
497 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
498 len_conv = icu_convert_case(u_strToLower, locale,
499 &buff_conv, buff_uchar, len_uchar);
500 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
501 pfree(buff_uchar);
502 pfree(buff_conv);
503
504 return result_len;
505}
506
507static size_t
508strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
510{
511 int32_t len_uchar;
512 int32_t len_conv;
513 UChar *buff_uchar;
514 UChar *buff_conv;
515 size_t result_len;
516
517 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
518 len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
519 &buff_conv, buff_uchar, len_uchar);
520 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
521 pfree(buff_uchar);
522 pfree(buff_conv);
523
524 return result_len;
525}
526
527static size_t
528strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
530{
531 int32_t len_uchar;
532 int32_t len_conv;
533 UChar *buff_uchar;
534 UChar *buff_conv;
535 size_t result_len;
536
537 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
538 len_conv = icu_convert_case(u_strToUpper, locale,
539 &buff_conv, buff_uchar, len_uchar);
540 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
541 pfree(buff_uchar);
542 pfree(buff_conv);
543
544 return result_len;
545}
546
547static size_t
548strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
550{
551 int32_t len_uchar;
552 int32_t len_conv;
553 UChar *buff_uchar;
554 UChar *buff_conv;
555 size_t result_len;
556
557 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
558 len_conv = icu_convert_case(u_strFoldCase_default, locale,
559 &buff_conv, buff_uchar, len_uchar);
560 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
561 pfree(buff_uchar);
562 pfree(buff_conv);
563
564 return result_len;
565}
566
/*
 * strncoll_icu_utf8
 *
 * Compare two strings by passing them straight to ucol_strcollUTF8(), with
 * no UChar conversion.  Only valid when the database encoding is UTF-8
 * (asserted).  An argument length of -1 means the string is NUL-terminated.
 *
 * Returns the ucol_strcollUTF8() result; an ICU failure is reported as an
 * error.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
static int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	int			result;
	UErrorCode	status;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	status = U_ZERO_ERROR;
	result = ucol_strcollUTF8(locale->icu.ucol,
							  arg1, len1,
							  arg2, len2,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}
#endif
596
597/* 'srclen' of -1 means the strings are NUL-terminated */
598size_t
599strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
601{
602 char sbuf[TEXTBUFLEN];
603 char *buf = sbuf;
604 UChar *uchar;
605 int32_t ulen;
606 size_t uchar_bsize;
607 Size result_bsize;
608
609 init_icu_converter();
610
611 ulen = uchar_length(icu_converter, src, srclen);
612
613 uchar_bsize = (ulen + 1) * sizeof(UChar);
614
615 if (uchar_bsize > TEXTBUFLEN)
616 buf = palloc(uchar_bsize);
617
618 uchar = (UChar *) buf;
619
620 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
621
622 result_bsize = ucol_getSortKey(locale->icu.ucol,
623 uchar, ulen,
624 (uint8_t *) dest, destsize);
625
626 /*
627 * ucol_getSortKey() counts the nul-terminator in the result length, but
628 * this function should not.
629 */
630 Assert(result_bsize > 0);
631 result_bsize--;
632
633 if (buf != sbuf)
634 pfree(buf);
635
636 /* if dest is defined, it should be nul-terminated */
637 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
638
639 return result_bsize;
640}
641
642/* 'srclen' of -1 means the strings are NUL-terminated */
643size_t
644strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
645 const char *src, ssize_t srclen,
647{
648 size_t result;
649 UCharIterator iter;
650 uint32_t state[2];
651 UErrorCode status;
652
654
655 uiter_setUTF8(&iter, src, srclen);
656 state[0] = state[1] = 0; /* won't need that again */
657 status = U_ZERO_ERROR;
658 result = ucol_nextSortKeyPart(locale->icu.ucol,
659 &iter,
660 state,
661 (uint8_t *) dest,
662 destsize,
663 &status);
664 if (U_FAILURE(status))
666 (errmsg("sort key generation failed: %s",
667 u_errorName(status))));
668
669 return result;
670}
671
672char *
673get_collation_actual_version_icu(const char *collcollate)
674{
675 UCollator *collator;
676 UVersionInfo versioninfo;
677 char buf[U_MAX_VERSION_STRING_LENGTH];
678
679 collator = pg_ucol_open(collcollate);
680
681 ucol_getVersion(collator, versioninfo);
682 ucol_close(collator);
683
684 u_versionToString(versioninfo, buf);
685 return pstrdup(buf);
686}
687
688/*
689 * Convert a string in the database encoding into a string of UChars.
690 *
691 * The source string at buff is of length nbytes
692 * (it needn't be nul-terminated)
693 *
694 * *buff_uchar receives a pointer to the palloc'd result string, and
695 * the function's result is the number of UChars generated.
696 *
697 * The result string is nul-terminated, though most callers rely on the
698 * result length instead.
699 */
700static int32_t
701icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
702{
703 int32_t len_uchar;
704
705 init_icu_converter();
706
707 len_uchar = uchar_length(icu_converter, buff, nbytes);
708
709 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
710 len_uchar = uchar_convert(icu_converter,
711 *buff_uchar, len_uchar + 1, buff, nbytes);
712
713 return len_uchar;
714}
715
716/*
717 * Convert a string of UChars into the database encoding.
718 *
719 * The source string at buff_uchar is of length len_uchar
720 * (it needn't be nul-terminated)
721 *
722 * *result receives a pointer to the palloc'd result string, and the
723 * function's result is the number of bytes generated (not counting nul).
724 *
725 * The result string is nul-terminated.
726 */
727static size_t
728icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
729{
730 UErrorCode status;
731 int32_t len_result;
732
733 init_icu_converter();
734
735 status = U_ZERO_ERROR;
736 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
737 buff_uchar, len_uchar, &status);
738 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
740 (errmsg("%s failed: %s", "ucnv_fromUChars",
741 u_errorName(status))));
742
743 if (len_result + 1 > destsize)
744 return len_result;
745
746 status = U_ZERO_ERROR;
747 len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
748 buff_uchar, len_uchar, &status);
749 if (U_FAILURE(status) ||
750 status == U_STRING_NOT_TERMINATED_WARNING)
752 (errmsg("%s failed: %s", "ucnv_fromUChars",
753 u_errorName(status))));
754
755 return len_result;
756}
757
758static int32_t
759icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
760 UChar **buff_dest, UChar *buff_source, int32_t len_source)
761{
762 UErrorCode status;
763 int32_t len_dest;
764
765 len_dest = len_source; /* try first with same length */
766 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
767 status = U_ZERO_ERROR;
768 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
769 mylocale->icu.locale, &status);
770 if (status == U_BUFFER_OVERFLOW_ERROR)
771 {
772 /* try again with adjusted length */
773 pfree(*buff_dest);
774 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
775 status = U_ZERO_ERROR;
776 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
777 mylocale->icu.locale, &status);
778 }
779 if (U_FAILURE(status))
781 (errmsg("case conversion failed: %s", u_errorName(status))));
782 return len_dest;
783}
784
785static int32_t
786u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
787 const UChar *src, int32_t srcLength,
788 const char *locale,
789 UErrorCode *pErrorCode)
790{
791 return u_strToTitle(dest, destCapacity, src, srcLength,
792 NULL, locale, pErrorCode);
793}
794
795static int32_t
796u_strFoldCase_default(UChar *dest, int32_t destCapacity,
797 const UChar *src, int32_t srcLength,
798 const char *locale,
799 UErrorCode *pErrorCode)
800{
801 uint32 options = U_FOLD_CASE_DEFAULT;
802 char lang[3];
803 UErrorCode status;
804
805 /*
806 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
807 * folding does not accept a locale. Instead it just supports a single
808 * option relevant to Turkic languages 'az' and 'tr'; check for those
809 * languages to enable the option.
810 */
811 status = U_ZERO_ERROR;
812 uloc_getLanguage(locale, lang, 3, &status);
813 if (U_SUCCESS(status))
814 {
815 /*
816 * The option name is confusing, but it causes u_strFoldCase to use
817 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
818 */
819 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
820 options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
821 }
822
823 return u_strFoldCase(dest, destCapacity, src, srcLength,
824 options, pErrorCode);
825}
826
827/*
828 * strncoll_icu
829 *
830 * Convert the arguments from the database encoding to UChar strings, then
831 * call ucol_strcoll(). An argument length of -1 means that the string is
832 * NUL-terminated.
833 *
834 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
835 * caller should call that instead.
836 */
837static int
838strncoll_icu(const char *arg1, ssize_t len1,
839 const char *arg2, ssize_t len2, pg_locale_t locale)
840{
841 char sbuf[TEXTBUFLEN];
842 char *buf = sbuf;
843 int32_t ulen1;
844 int32_t ulen2;
845 size_t bufsize1;
846 size_t bufsize2;
847 UChar *uchar1,
848 *uchar2;
849 int result;
850
851 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
852#ifdef HAVE_UCOL_STRCOLLUTF8
854#endif
855
856 init_icu_converter();
857
858 ulen1 = uchar_length(icu_converter, arg1, len1);
859 ulen2 = uchar_length(icu_converter, arg2, len2);
860
861 bufsize1 = (ulen1 + 1) * sizeof(UChar);
862 bufsize2 = (ulen2 + 1) * sizeof(UChar);
863
864 if (bufsize1 + bufsize2 > TEXTBUFLEN)
865 buf = palloc(bufsize1 + bufsize2);
866
867 uchar1 = (UChar *) buf;
868 uchar2 = (UChar *) (buf + bufsize1);
869
870 ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
871 ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
872
873 result = ucol_strcoll(locale->icu.ucol,
874 uchar1, ulen1,
875 uchar2, ulen2);
876
877 if (buf != sbuf)
878 pfree(buf);
879
880 return result;
881}
882
883/* 'srclen' of -1 means the strings are NUL-terminated */
884static size_t
885strnxfrm_prefix_icu(char *dest, size_t destsize,
886 const char *src, ssize_t srclen,
888{
889 char sbuf[TEXTBUFLEN];
890 char *buf = sbuf;
891 UCharIterator iter;
892 uint32_t state[2];
893 UErrorCode status;
894 int32_t ulen = -1;
895 UChar *uchar = NULL;
896 size_t uchar_bsize;
897 Size result_bsize;
898
899 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
901
902 init_icu_converter();
903
904 ulen = uchar_length(icu_converter, src, srclen);
905
906 uchar_bsize = (ulen + 1) * sizeof(UChar);
907
908 if (uchar_bsize > TEXTBUFLEN)
909 buf = palloc(uchar_bsize);
910
911 uchar = (UChar *) buf;
912
913 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
914
915 uiter_setString(&iter, uchar, ulen);
916 state[0] = state[1] = 0; /* won't need that again */
917 status = U_ZERO_ERROR;
918 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
919 &iter,
920 state,
921 (uint8_t *) dest,
922 destsize,
923 &status);
924 if (U_FAILURE(status))
926 (errmsg("sort key generation failed: %s",
927 u_errorName(status))));
928
929 return result_bsize;
930}
931
932static void
933init_icu_converter(void)
934{
935 const char *icu_encoding_name;
936 UErrorCode status;
937 UConverter *conv;
938
939 if (icu_converter)
940 return; /* already done */
941
942 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
943 if (!icu_encoding_name)
945 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
946 errmsg("encoding \"%s\" not supported by ICU",
948
949 status = U_ZERO_ERROR;
950 conv = ucnv_open(icu_encoding_name, &status);
951 if (U_FAILURE(status))
953 (errmsg("could not open ICU converter for encoding \"%s\": %s",
954 icu_encoding_name, u_errorName(status))));
955
956 icu_converter = conv;
957}
958
959/*
960 * Find length, in UChars, of given string if converted to UChar string.
961 *
962 * A length of -1 indicates that the input string is NUL-terminated.
963 */
964static size_t
965uchar_length(UConverter *converter, const char *str, int32_t len)
966{
967 UErrorCode status = U_ZERO_ERROR;
968 int32_t ulen;
969
970 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
971 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
973 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
974 return ulen;
975}
976
977/*
978 * Convert the given source string into a UChar string, stored in dest, and
979 * return the length (in UChars).
980 *
981 * A srclen of -1 indicates that the input string is NUL-terminated.
982 */
983static int32_t
984uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
985 const char *src, int32_t srclen)
986{
987 UErrorCode status = U_ZERO_ERROR;
988 int32_t ulen;
989
990 status = U_ZERO_ERROR;
991 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
992 if (U_FAILURE(status))
994 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
995 return ulen;
996}
997
998/*
999 * Parse collation attributes from the given locale string and apply them to
1000 * the open collator.
1001 *
1002 * First, the locale string is canonicalized to an ICU format locale ID such
1003 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1004 * the key-value arguments.
1005 *
1006 * Starting with ICU version 54, the attributes are processed automatically by
1007 * ucol_open(), so this is only necessary for emulating this behavior on older
1008 * versions.
1009 */
1011static void
1012icu_set_collation_attributes(UCollator *collator, const char *loc,
1013 UErrorCode *status)
1014{
1015 int32_t len;
1016 char *icu_locale_id;
1017 char *lower_str;
1018 char *str;
1019 char *token;
1020
1021 /*
1022 * The input locale may be a BCP 47 language tag, e.g.
1023 * "und-u-kc-ks-level1", which expresses the same attributes in a
1024 * different form. It will be converted to the equivalent ICU format
1025 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1026 * uloc_canonicalize().
1027 */
1028 *status = U_ZERO_ERROR;
1029 len = uloc_canonicalize(loc, NULL, 0, status);
1030 icu_locale_id = palloc(len + 1);
1031 *status = U_ZERO_ERROR;
1032 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1033 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1034 return;
1035
1036 lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1037
1038 pfree(icu_locale_id);
1039
1040 str = strchr(lower_str, '@');
1041 if (!str)
1042 return;
1043 str++;
1044
1045 while ((token = strsep(&str, ";")))
1046 {
1047 char *e = strchr(token, '=');
1048
1049 if (e)
1050 {
1051 char *name;
1052 char *value;
1053 UColAttribute uattr;
1054 UColAttributeValue uvalue;
1055
1056 *status = U_ZERO_ERROR;
1057
1058 *e = '\0';
1059 name = token;
1060 value = e + 1;
1061
1062 /*
1063 * See attribute name and value lists in ICU i18n/coll.cpp
1064 */
1065 if (strcmp(name, "colstrength") == 0)
1066 uattr = UCOL_STRENGTH;
1067 else if (strcmp(name, "colbackwards") == 0)
1068 uattr = UCOL_FRENCH_COLLATION;
1069 else if (strcmp(name, "colcaselevel") == 0)
1070 uattr = UCOL_CASE_LEVEL;
1071 else if (strcmp(name, "colcasefirst") == 0)
1072 uattr = UCOL_CASE_FIRST;
1073 else if (strcmp(name, "colalternate") == 0)
1074 uattr = UCOL_ALTERNATE_HANDLING;
1075 else if (strcmp(name, "colnormalization") == 0)
1076 uattr = UCOL_NORMALIZATION_MODE;
1077 else if (strcmp(name, "colnumeric") == 0)
1078 uattr = UCOL_NUMERIC_COLLATION;
1079 else
1080 /* ignore if unknown */
1081 continue;
1082
1083 if (strcmp(value, "primary") == 0)
1084 uvalue = UCOL_PRIMARY;
1085 else if (strcmp(value, "secondary") == 0)
1086 uvalue = UCOL_SECONDARY;
1087 else if (strcmp(value, "tertiary") == 0)
1088 uvalue = UCOL_TERTIARY;
1089 else if (strcmp(value, "quaternary") == 0)
1090 uvalue = UCOL_QUATERNARY;
1091 else if (strcmp(value, "identical") == 0)
1092 uvalue = UCOL_IDENTICAL;
1093 else if (strcmp(value, "no") == 0)
1094 uvalue = UCOL_OFF;
1095 else if (strcmp(value, "yes") == 0)
1096 uvalue = UCOL_ON;
1097 else if (strcmp(value, "shifted") == 0)
1098 uvalue = UCOL_SHIFTED;
1099 else if (strcmp(value, "non-ignorable") == 0)
1100 uvalue = UCOL_NON_IGNORABLE;
1101 else if (strcmp(value, "lower") == 0)
1102 uvalue = UCOL_LOWER_FIRST;
1103 else if (strcmp(value, "upper") == 0)
1104 uvalue = UCOL_UPPER_FIRST;
1105 else
1106 {
1107 *status = U_ILLEGAL_ARGUMENT_ERROR;
1108 break;
1109 }
1110
1111 ucol_setAttribute(collator, uattr, uvalue, status);
1112 }
1113 }
1114
1115 pfree(lower_str);
1116}
1117
1118#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define pg_attribute_unused()
Definition: c.h:138
uint32_t uint32
Definition: c.h:552
size_t Size
Definition: c.h:624
Oid collid
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:1888
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
#define token
Definition: indent_globs.h:126
static struct @171 value
static char * locale
Definition: initdb.c:140
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1746
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1417
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
Definition: pg_locale_icu.c:48
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
e
Definition: preproc-init.c:82
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:75
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:101
const struct ctype_methods * ctype
Definition: pg_locale.h:152
const struct collate_methods * collate
Definition: pg_locale.h:151
const char * locale
Definition: pg_locale.h:158
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:595
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
const char * name