PostgreSQL Source Code git master
unicode_case.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * unicode_case.c
3 * Unicode case mapping and case conversion.
4 *
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode_case.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#ifndef FRONTEND
13#include "postgres.h"
14#else
15#include "postgres_fe.h"
16#endif
17
18#include "common/unicode_case.h"
21#include "mb/pg_wchar.h"
22
24{
28};
29
30/*
31 * Map for each case kind.
32 */
33static const pg_wchar *const casekind_map[NCaseKind] =
34{
39};
40
41static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
42static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
43 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
44 void *wbstate);
45static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
46 const char *src, size_t srclen, size_t srcoff,
47 pg_wchar *simple, const pg_wchar **special);
48
51{
53
54 return cp != 0 ? cp : code;
55}
56
59{
61
62 return cp != 0 ? cp : code;
63}
64
67{
69
70 return cp != 0 ? cp : code;
71}
72
75{
77
78 return cp != 0 ? cp : code;
79}
80
81/*
82 * unicode_strlower()
83 *
84 * Convert src to lowercase, and return the result length (not including
85 * terminating NUL).
86 *
87 * String src must be encoded in UTF-8. If srclen < 0, src must be
88 * NUL-terminated.
89 *
90 * Result string is stored in dst, truncating if larger than dstsize. If
91 * dstsize is greater than the result length, dst will be NUL-terminated;
92 * otherwise not.
93 *
94 * If dstsize is zero, dst may be NULL. This is useful for calculating the
95 * required buffer size before allocating.
96 *
97 * If full is true, use special case mappings if available and if the
98 * conditions are satisfied.
99 */
100size_t
101unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
102 bool full)
103{
104 return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
105 NULL);
106}
107
108/*
109 * unicode_strtitle()
110 *
111 * Convert src to titlecase, and return the result length (not including
112 * terminating NUL).
113 *
114 * String src must be encoded in UTF-8. If srclen < 0, src must be
115 * NUL-terminated.
116 *
117 * Result string is stored in dst, truncating if larger than dstsize. If
118 * dstsize is greater than the result length, dst will be NUL-terminated;
119 * otherwise not.
120 *
121 * If dstsize is zero, dst may be NULL. This is useful for calculating the
122 * required buffer size before allocating.
123 *
124 * If full is true, use special case mappings if available and if the
125 * conditions are satisfied. Otherwise, use only simple mappings and use
126 * uppercase instead of titlecase.
127 *
128 * Titlecasing requires knowledge about word boundaries, which is provided by
129 * the callback wbnext. A word boundary is the offset of the start of a word
130 * or the offset of the character immediately following a word.
131 *
132 * The caller is expected to initialize and free the callback state
133 * wbstate. The callback should first return offset 0 for the first boundary;
134 * then the offset of each subsequent word boundary; then the total length of
135 * the string to indicate the final boundary.
136 */
137size_t
138unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
139 bool full, WordBoundaryNext wbnext, void *wbstate)
140{
141 return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
142 wbstate);
143}
144
145/*
146 * unicode_strupper()
147 *
148 * Convert src to uppercase, and return the result length (not including
149 * terminating NUL).
150 *
151 * String src must be encoded in UTF-8. If srclen < 0, src must be
152 * NUL-terminated.
153 *
154 * Result string is stored in dst, truncating if larger than dstsize. If
155 * dstsize is greater than the result length, dst will be NUL-terminated;
156 * otherwise not.
157 *
158 * If dstsize is zero, dst may be NULL. This is useful for calculating the
159 * required buffer size before allocating.
160 *
161 * If full is true, use special case mappings if available and if the
162 * conditions are satisfied.
163 */
164size_t
165unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
166 bool full)
167{
168 return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
169 NULL);
170}
171
172/*
173 * unicode_strfold()
174 *
175 * Case fold src, and return the result length (not including terminating
176 * NUL).
177 *
178 * String src must be encoded in UTF-8. If srclen < 0, src must be
179 * NUL-terminated.
180 *
181 * Result string is stored in dst, truncating if larger than dstsize. If
182 * dstsize is greater than the result length, dst will be NUL-terminated;
183 * otherwise not.
184 *
185 * If dstsize is zero, dst may be NULL. This is useful for calculating the
186 * required buffer size before allocating.
187 */
188size_t
189unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
190 bool full)
191{
192 return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
193 NULL);
194}
195
196/*
197 * Implement Unicode Default Case Conversion algorithm.
198 *
199 * If str_casekind is CaseLower or CaseUpper, map each character in the string
200 * for which a mapping is available.
201 *
202 * If str_casekind is CaseTitle, maps characters found on a word boundary to
203 * titlecase (or uppercase if full is false) and other characters to
204 * lowercase. NB: does not currently implement the Unicode behavior in which
205 * the word boundary is adjusted to the next Cased character. That behavior
206 * could be implemented as an option, but it doesn't match the default
207 * behavior of ICU, nor does it match the documented behavior of INITCAP().
208 *
209 * If full is true, use special mappings for relevant characters, which can
210 * map a single codepoint to multiple codepoints, or depend on conditions.
211 */
212static size_t
213convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
215 void *wbstate)
216{
217 /* character CaseKind varies while titlecasing */
218 CaseKind chr_casekind = str_casekind;
219 size_t srcoff = 0;
220 size_t result_len = 0;
221 size_t boundary = 0;
222
223 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
224 (str_casekind != CaseTitle && !wbnext && !wbstate));
225
226 if (str_casekind == CaseTitle)
227 {
228 boundary = wbnext(wbstate);
229 Assert(boundary == 0); /* start of text is always a boundary */
230 }
231
232 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
233 {
234 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
235 int u1len = unicode_utf8len(u1);
236 pg_wchar simple = 0;
237 const pg_wchar *special = NULL;
238 enum CaseMapResult casemap_result;
239
240 if (str_casekind == CaseTitle)
241 {
242 if (srcoff == boundary)
243 {
244 chr_casekind = full ? CaseTitle : CaseUpper;
245 boundary = wbnext(wbstate);
246 }
247 else
248 chr_casekind = CaseLower;
249 }
250
251 casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
252 &simple, &special);
253
254 switch (casemap_result)
255 {
256 case CASEMAP_SELF:
257 /* no mapping; copy bytes from src */
258 Assert(simple == 0);
259 Assert(special == NULL);
260 if (result_len + u1len <= dstsize)
261 memcpy(dst + result_len, src + srcoff, u1len);
262
263 result_len += u1len;
264 break;
265 case CASEMAP_SIMPLE:
266 {
267 /* replace with single character */
268 pg_wchar u2 = simple;
269 pg_wchar u2len = unicode_utf8len(u2);
270
271 Assert(special == NULL);
272 if (result_len + u2len <= dstsize)
273 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
274
275 result_len += u2len;
276 }
277 break;
278 case CASEMAP_SPECIAL:
279 /* replace with up to MAX_CASE_EXPANSION characters */
280 Assert(simple == 0);
281 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
282 {
283 pg_wchar u2 = special[i];
284 size_t u2len = unicode_utf8len(u2);
285
286 if (result_len + u2len <= dstsize)
287 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
288
289 result_len += u2len;
290 }
291 break;
292 }
293
294 srcoff += u1len;
295 }
296
297 if (result_len < dstsize)
298 dst[result_len] = '\0';
299
300 return result_len;
301}
302
303/*
304 * Check that the condition matches Final_Sigma, described in Unicode Table
305 * 3-17. The character at the given offset must be directly preceded by a
306 * Cased character, and must not be directly followed by a Cased character.
307 *
308 * Case_Ignorable characters are ignored. NB: some characters may be both
309 * Cased and Case_Ignorable, in which case they are ignored.
310 */
311static bool
312check_final_sigma(const unsigned char *str, size_t len, size_t offset)
313{
314 /* the start of the string is not preceded by a Cased character */
315 if (offset == 0)
316 return false;
317
318 /* iterate backwards, looking for Cased character */
319 for (int i = offset - 1; i >= 0; i--)
320 {
321 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
322 {
323 pg_wchar curr = utf8_to_unicode(str + i);
324
325 if (pg_u_prop_case_ignorable(curr))
326 continue;
327 else if (pg_u_prop_cased(curr))
328 break;
329 else
330 return false;
331 }
332 else if ((str[i] & 0xC0) == 0x80)
333 continue;
334
335 Assert(false); /* invalid UTF-8 */
336 }
337
338 /* end of string is not followed by a Cased character */
339 if (offset == len)
340 return true;
341
342 /* iterate forwards, looking for Cased character */
343 for (int i = offset + 1; i < len && str[i] != '\0'; i++)
344 {
345 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
346 {
347 pg_wchar curr = utf8_to_unicode(str + i);
348
349 if (pg_u_prop_case_ignorable(curr))
350 continue;
351 else if (pg_u_prop_cased(curr))
352 return false;
353 else
354 break;
355 }
356 else if ((str[i] & 0xC0) == 0x80)
357 continue;
358
359 Assert(false); /* invalid UTF-8 */
360 }
361
362 return true;
363}
364
365/*
366 * Unicode allows for special casing to be applied only under certain
367 * circumstances. The only currently-supported condition is Final_Sigma.
368 */
369static bool
370check_special_conditions(int conditions, const char *str, size_t len,
371 size_t offset)
372{
373 if (conditions == 0)
374 return true;
375 else if (conditions == PG_U_FINAL_SIGMA)
376 return check_final_sigma((unsigned char *) str, len, offset);
377
378 /* no other conditions supported */
379 Assert(false);
380 return false;
381}
382
383/*
384 * Map the given character to the requested case.
385 *
386 * If full is true, and a special case mapping is found and the conditions are
387 * met, 'special' is set to the mapping result (which is an array of up to
388 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
389 *
390 * Otherwise, search for a simple mapping, and if found, set 'simple' to the
391 * result and return CASEMAP_SIMPLE.
392 *
393 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
394 * character without modification.
395 */
396static enum CaseMapResult
397casemap(pg_wchar u1, CaseKind casekind, bool full,
398 const char *src, size_t srclen, size_t srcoff,
399 pg_wchar *simple, const pg_wchar **special)
400{
401 uint16 idx;
402
403 /* Fast path for codepoints < 0x80 */
404 if (u1 < 0x80)
405 {
406 /*
407 * The first elements in all tables are reserved as 0 (as NULL). The
408 * data starts at index 1, not 0.
409 */
410 *simple = casekind_map[casekind][u1 + 1];
411
412 return CASEMAP_SIMPLE;
413 }
414
415 idx = case_index(u1);
416
417 if (idx == 0)
418 return CASEMAP_SELF;
419
420 if (full && case_map_special[idx] &&
422 src, srclen, srcoff))
423 {
424 *special = special_case[case_map_special[idx]].map[casekind];
425 return CASEMAP_SPECIAL;
426 }
427
428 *simple = casekind_map[casekind][idx];
429
430 return CASEMAP_SIMPLE;
431}
432
433/*
434 * Find entry in simple case map.
435 * If the entry does not exist, 0 will be returned.
436 */
437static pg_wchar
439{
440 /* Fast path for codepoints < 0x80 */
441 if (ucs < 0x80)
442 /* The first elements in all tables are reserved as 0 (as NULL). */
443 return map[ucs + 1];
444 return map[case_index(ucs)];
445}
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:262
uint16_t uint16
Definition: c.h:501
Assert(PointerIsAligned(start, uint64))
const char * str
int i
Definition: isn.c:77
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:66
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:58
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:165
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)
Definition: unicode_case.c:397
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:101
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:213
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:138
static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map)
Definition: unicode_case.c:438
static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)
Definition: unicode_case.c:370
static const pg_wchar *const casekind_map[NCaseKind]
Definition: unicode_case.c:33
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:50
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:189
pg_wchar unicode_casefold_simple(pg_wchar code)
Definition: unicode_case.c:74
CaseMapResult
Definition: unicode_case.c:24
@ CASEMAP_SPECIAL
Definition: unicode_case.c:27
@ CASEMAP_SIMPLE
Definition: unicode_case.c:26
@ CASEMAP_SELF
Definition: unicode_case.c:25
static bool check_final_sigma(const unsigned char *str, size_t len, size_t offset)
Definition: unicode_case.c:312
size_t(* WordBoundaryNext)(void *wbstate)
Definition: unicode_case.h:19
static const uint8 case_map_special[1704]
#define MAX_CASE_EXPANSION
#define PG_U_FINAL_SIGMA
@ CaseFold
@ CaseTitle
@ NCaseKind
@ CaseLower
@ CaseUpper
static const pg_special_case special_case[106]
static const pg_wchar case_map_title[1704]
static const pg_wchar case_map_upper[1704]
static const pg_wchar case_map_fold[1704]
static const pg_wchar case_map_lower[1704]
static uint16 case_index(pg_wchar cp)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)