PostgreSQL Source Code git master
case_test.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * case_test.c
3 * Program to test Unicode case mapping functions.
4 *
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode/case_test.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#include "postgres_fe.h"
13
14#include <locale.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <wctype.h>
19
20#ifdef USE_ICU
21#include <unicode/ucasemap.h>
22#include <unicode/uchar.h>
23#endif
24#include "common/unicode_case.h"
27
/*
 * Size of the fixed scratch buffers used by the ICU comparison tests.  Must
 * be large enough for the longest source or result string, counting the
 * terminating NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/* ICU case-mapping handle; opened and closed by main() */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature of the tfunc_* wrappers handed to test_convert(): convert
 * "src" (srclen bytes, or NUL-terminated when srclen is negative) into "dst"
 * (dstsize bytes) and return a byte count.
 */
typedef size_t (*TestFunc) (char *dst,
							size_t dstsize,
							const char *src,
							ssize_t srclen);
37
/*
 * Simple word-boundary iterator, copied from pg_locale_builtin.c.
 *
 * This is the state advanced by initcap_wbnext(); a pointer to it is passed
 * to unicode_strtitle() as the opaque word-boundary state.
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* upper bound on bytes of str to scan */
	size_t		offset;			/* byte offset of next character to examine */
	bool		posix;			/* forwarded to pg_u_isalnum() */
	bool		init;			/* has the first boundary been returned? */
	bool		prev_alnum;		/* was the previous character alphanumeric? */
};
48
49static size_t
51{
52 struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
53
54 while (wbstate->offset < wbstate->len &&
55 wbstate->str[wbstate->offset] != '\0')
56 {
57 pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
58 wbstate->offset);
59 bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
60
61 if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
62 {
63 size_t prev_offset = wbstate->offset;
64
65 wbstate->init = true;
66 wbstate->offset += unicode_utf8len(u);
67 wbstate->prev_alnum = curr_alnum;
68 return prev_offset;
69 }
70
71 wbstate->offset += unicode_utf8len(u);
72 }
73
74 return wbstate->len;
75}
76
77#ifdef USE_ICU
78
79static void
80icu_test_simple(pg_wchar code)
81{
86 pg_wchar iculower = u_tolower(code);
87 pg_wchar icutitle = u_totitle(code);
88 pg_wchar icuupper = u_toupper(code);
89 pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
90
91 if (lower != iculower || title != icutitle || upper != icuupper ||
92 fold != icufold)
93 {
94 printf("case_test: FAILURE for codepoint 0x%06x\n", code);
95 printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
96 lower, title, upper, fold);
97 printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
98 iculower, icutitle, icuupper, icufold);
99 printf("\n");
100 exit(1);
101 }
102}
103
104static void
105icu_test_full(char *str)
106{
107 char lower[BUFSZ];
108 char title[BUFSZ];
109 char upper[BUFSZ];
110 char fold[BUFSZ];
111 char icu_lower[BUFSZ];
112 char icu_title[BUFSZ];
113 char icu_upper[BUFSZ];
114 char icu_fold[BUFSZ];
115 UErrorCode status;
116
117 /* full case mapping doesn't use posix semantics */
118 struct WordBoundaryState wbstate = {
119 .str = str,
120 .len = strlen(str),
121 .offset = 0,
122 .posix = false,
123 .init = false,
124 .prev_alnum = false,
125 };
126
127 unicode_strlower(lower, BUFSZ, str, -1, true);
128 unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
129 unicode_strupper(upper, BUFSZ, str, -1, true);
130 unicode_strfold(fold, BUFSZ, str, -1, true);
131 status = U_ZERO_ERROR;
132 ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
133 status = U_ZERO_ERROR;
134 ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
135 status = U_ZERO_ERROR;
136 ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
137 status = U_ZERO_ERROR;
138 ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
139
140 if (strcmp(lower, icu_lower) != 0)
141 {
142 printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
143 icu_lower);
144 exit(1);
145 }
146 if (strcmp(title, icu_title) != 0)
147 {
148 printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
149 icu_title);
150 exit(1);
151 }
152 if (strcmp(upper, icu_upper) != 0)
153 {
154 printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
155 icu_upper);
156 exit(1);
157 }
158 if (strcmp(fold, icu_fold) != 0)
159 {
160 printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
161 icu_fold);
162 exit(1);
163 }
164}
165
166/*
167 * Exhaustively compare case mappings with the results from ICU.
168 */
169static void
170test_icu(void)
171{
172 int successful = 0;
173 int skipped_mismatch = 0;
174
175 for (pg_wchar code = 0; code <= 0x10ffff; code++)
176 {
177 pg_unicode_category category = unicode_category(code);
178
179 if (category != PG_U_UNASSIGNED)
180 {
181 uint8_t icu_category = u_charType(code);
182 char code_str[5] = {0};
183
184 if (icu_category == PG_U_UNASSIGNED)
185 {
186 skipped_mismatch++;
187 continue;
188 }
189
190 icu_test_simple(code);
191 unicode_to_utf8(code, (unsigned char *) code_str);
192 icu_test_full(code_str);
193
194 successful++;
195 }
196 }
197
198 if (skipped_mismatch > 0)
199 printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
200 skipped_mismatch);
201
202 printf("case_test: ICU simple mapping test: %d codepoints successful\n",
203 successful);
204}
205#endif
206
207static void
208test_convert(TestFunc tfunc, const char *test_string, const char *expected)
209{
210 size_t src1len = strlen(test_string);
211 size_t src2len = -1; /* NUL-terminated */
212 size_t dst1len = strlen(expected);
213 size_t dst2len = strlen(expected) + 1; /* NUL-terminated */
214 char *src1 = malloc(src1len);
215 char *dst1 = malloc(dst1len);
216 char *src2 = strdup(test_string);
217 char *dst2 = malloc(dst2len);
218 size_t needed;
219
220 memcpy(src1, test_string, src1len); /* not NUL-terminated */
221
222 /* neither source nor destination are NUL-terminated */
223 memset(dst1, 0x7F, dst1len);
224 needed = tfunc(dst1, dst1len, src1, src1len);
225 if (needed != strlen(expected))
226 {
227 printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
228 test_string, needed, strlen(expected));
229 exit(1);
230 }
231 if (memcmp(dst1, expected, dst1len) != 0)
232 {
233 printf("case_test: convert_case test1 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
234 test_string, (int) dst1len, dst1, expected);
235 exit(1);
236 }
237
238 /* destination is NUL-terminated and source is not */
239 memset(dst2, 0x7F, dst2len);
240 needed = tfunc(dst2, dst2len, src1, src1len);
241 if (needed != strlen(expected))
242 {
243 printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
244 test_string, needed, strlen(expected));
245 exit(1);
246 }
247 if (strcmp(dst2, expected) != 0)
248 {
249 printf("case_test: convert_case test2 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
250 test_string, dst2, expected);
251 exit(1);
252 }
253
254 /* source is NUL-terminated and destination is not */
255 memset(dst1, 0x7F, dst1len);
256 needed = tfunc(dst1, dst1len, src2, src2len);
257 if (needed != strlen(expected))
258 {
259 printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
260 test_string, needed, strlen(expected));
261 printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
262 exit(1);
263 }
264 if (memcmp(dst1, expected, dst1len) != 0)
265 {
266 printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
267 test_string, (int) dst1len, dst1, expected);
268 exit(1);
269 }
270
271 /* both source and destination are NUL-terminated */
272 memset(dst2, 0x7F, dst2len);
273 needed = tfunc(dst2, dst2len, src2, src2len);
274 if (needed != strlen(expected))
275 {
276 printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
277 test_string, needed, strlen(expected));
278 exit(1);
279 }
280 if (strcmp(dst2, expected) != 0)
281 {
282 printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
283 test_string, dst2, expected);
284 exit(1);
285 }
286
287 free(src1);
288 free(dst1);
289 free(src2);
290 free(dst2);
291}
292
/*
 * TestFunc wrapper: lowercase "src" into "dst" via unicode_strlower() with
 * full case mapping enabled, returning whatever that call returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		result;

	result = unicode_strlower(dst, dstsize, src, srclen, full);

	return result;
}
299
300static size_t
301tfunc_title(char *dst, size_t dstsize, const char *src,
302 ssize_t srclen)
303{
304 struct WordBoundaryState wbstate = {
305 .str = src,
306 .len = srclen,
307 .offset = 0,
308 .init = false,
309 .prev_alnum = false,
310 };
311
312 return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
313 &wbstate);
314}
315
/*
 * TestFunc wrapper: uppercase "src" into "dst" via unicode_strupper() with
 * full case mapping enabled, returning whatever that call returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		result;

	result = unicode_strupper(dst, dstsize, src, srclen, full);

	return result;
}
322
/*
 * TestFunc wrapper: case-fold "src" into "dst" via unicode_strfold() with
 * full case mapping enabled, returning whatever that call returns.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
		   ssize_t srclen)
{
	const bool	full = true;
	size_t		result;

	result = unicode_strfold(dst, dstsize, src, srclen, full);

	return result;
}
329
330static void
332{
333 /* test string with no case changes */
334 test_convert(tfunc_lower, "√∞", "√∞");
335 /* test adjust-to-cased behavior */
336 test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
337 /* test string with case changes */
338 test_convert(tfunc_upper, "abc", "ABC");
339 /* test string with case changes and byte length changes */
340 test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
341 /* test special case conversions */
342 test_convert(tfunc_upper, "ß", "SS");
343 test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
344 test_convert(tfunc_upper, "ıiIİ", "IIIİ");
345 test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
346 /* test final sigma */
347 test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
348 test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
349 test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
350 test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
351 /* test that alphanumerics are word characters */
352 test_convert(tfunc_title, "λλ", "Λλ");
353 test_convert(tfunc_title, "1a", "1a");
354 /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
355 test_convert(tfunc_title, "\uFF11a", "\uFF11a");
356
357
358#ifdef USE_ICU
359 icu_test_full("");
360 icu_test_full("ȺȺȺ");
361 icu_test_full("ßßß");
362 icu_test_full("√∞");
363 icu_test_full("a b");
364 icu_test_full("abc 123xyz");
365 icu_test_full("σςΣ ΣΣΣ");
366 icu_test_full("ıiIİ");
367 icu_test_full("\uFF11a");
368 /* test <alpha><iota_subscript><acute> */
369 icu_test_full("\u0391\u0345\u0301");
370#endif
371
372 printf("case_test: convert_case: success\n");
373}
374
375int
376main(int argc, char **argv)
377{
378#ifdef USE_ICU
379 UErrorCode status = U_ZERO_ERROR;
380
381 /*
382 * Disable ICU's word break adjustment for titlecase to match the expected
383 * behavior of unicode_strtitle().
384 */
385 casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
386 if (U_FAILURE(status))
387 {
388 printf("case_test: failure opening UCaseMap: %s\n",
389 u_errorName(status));
390 exit(1);
391 }
392#endif
393
394 printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
395#ifdef USE_ICU
396 printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
397 test_icu();
398#else
399 printf("case_test: ICU not available; skipping\n");
400#endif
401
403
404#ifdef USE_ICU
405 ucasemap_close(casemap);
406#endif
407 exit(0);
408}
static void test_convert_case()
Definition: case_test.c:331
static void test_convert(TestFunc tfunc, const char *test_string, const char *expected)
Definition: case_test.c:208
int main(int argc, char **argv)
Definition: case_test.c:376
static size_t tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:294
static size_t initcap_wbnext(void *state)
Definition: case_test.c:50
static size_t tfunc_title(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:301
static size_t tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:317
#define BUFSZ
Definition: case_test.c:29
static size_t tfunc_fold(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:324
size_t(* TestFunc)(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:35
const char * str
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
#define printf(...)
Definition: port.h:245
Definition: regguts.h:323
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:66
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:58
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:165
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)
Definition: unicode_case.c:397
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:101
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:138
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:50
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:189
pg_wchar unicode_casefold_simple(pg_wchar code)
Definition: unicode_case.c:74
bool pg_u_isalnum(pg_wchar code, bool posix)
pg_unicode_category unicode_category(pg_wchar code)
pg_unicode_category
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION