Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for builtin provider
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_builtin.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include "catalog/pg_database.h"
15 : #include "catalog/pg_collation.h"
16 : #include "common/unicode_case.h"
17 : #include "common/unicode_category.h"
18 : #include "mb/pg_wchar.h"
19 : #include "miscadmin.h"
20 : #include "utils/builtins.h"
21 : #include "utils/pg_locale.h"
22 : #include "utils/syscache.h"
23 :
24 : extern pg_locale_t create_pg_locale_builtin(Oid collid,
25 : MemoryContext context);
26 : extern char *get_collation_actual_version_builtin(const char *collcollate);
27 :
/*
 * Iteration state handed to initcap_wbnext() through its opaque "state"
 * argument.
 */
struct WordBoundaryState
{
	const char *str;			/* text being scanned */
	size_t		len;			/* scanning stops once offset reaches this */
	size_t		offset;			/* byte offset of the next character to look at */
	bool		posix;			/* forwarded to pg_u_isalnum() */
	bool		init;			/* false until the first boundary is reported */
	bool		prev_alnum;		/* pg_u_isalnum() result at the last boundary */
};
37 :
38 : /*
39 : * Simple word boundary iterator that draws boundaries each time the result of
40 : * pg_u_isalnum() changes.
41 : */
42 : static size_t
43 824 : initcap_wbnext(void *state)
44 : {
45 824 : struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
46 :
47 1700 : while (wbstate->offset < wbstate->len &&
48 1506 : wbstate->str[wbstate->offset] != '\0')
49 : {
50 1506 : pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
51 1506 : wbstate->offset);
52 1506 : bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
53 :
54 1506 : if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
55 : {
56 630 : size_t prev_offset = wbstate->offset;
57 :
58 630 : wbstate->init = true;
59 630 : wbstate->offset += unicode_utf8len(u);
60 630 : wbstate->prev_alnum = curr_alnum;
61 630 : return prev_offset;
62 : }
63 :
64 876 : wbstate->offset += unicode_utf8len(u);
65 : }
66 :
67 194 : return wbstate->len;
68 : }
69 :
70 : static size_t
71 13022 : strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
72 : pg_locale_t locale)
73 : {
74 26044 : return unicode_strlower(dest, destsize, src, srclen,
75 13022 : locale->info.builtin.casemap_full);
76 : }
77 :
78 : static size_t
79 194 : strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
80 : pg_locale_t locale)
81 : {
82 194 : struct WordBoundaryState wbstate = {
83 : .str = src,
84 : .len = srclen,
85 : .offset = 0,
86 194 : .posix = !locale->info.builtin.casemap_full,
87 : .init = false,
88 : .prev_alnum = false,
89 : };
90 :
91 388 : return unicode_strtitle(dest, destsize, src, srclen,
92 194 : locale->info.builtin.casemap_full,
93 : initcap_wbnext, &wbstate);
94 : }
95 :
96 : static size_t
97 316882 : strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
98 : pg_locale_t locale)
99 : {
100 633764 : return unicode_strupper(dest, destsize, src, srclen,
101 316882 : locale->info.builtin.casemap_full);
102 : }
103 :
104 : static size_t
105 12 : strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
106 : pg_locale_t locale)
107 : {
108 24 : return unicode_strfold(dest, destsize, src, srclen,
109 12 : locale->info.builtin.casemap_full);
110 : }
111 :
112 : static bool
113 65660 : wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
114 : {
115 65660 : return pg_u_isdigit(wc, !locale->info.builtin.casemap_full);
116 : }
117 :
118 : static bool
119 4118 : wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
120 : {
121 4118 : return pg_u_isalpha(wc);
122 : }
123 :
124 : static bool
125 36860 : wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
126 : {
127 36860 : return pg_u_isalnum(wc, !locale->info.builtin.casemap_full);
128 : }
129 :
130 : static bool
131 24576 : wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
132 : {
133 24576 : return pg_u_isupper(wc);
134 : }
135 :
136 : static bool
137 0 : wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
138 : {
139 0 : return pg_u_islower(wc);
140 : }
141 :
142 : static bool
143 0 : wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
144 : {
145 0 : return pg_u_isgraph(wc);
146 : }
147 :
148 : static bool
149 0 : wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
150 : {
151 0 : return pg_u_isprint(wc);
152 : }
153 :
154 : static bool
155 24576 : wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
156 : {
157 24576 : return pg_u_ispunct(wc, !locale->info.builtin.casemap_full);
158 : }
159 :
160 : static bool
161 16398 : wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
162 : {
163 16398 : return pg_u_isspace(wc);
164 : }
165 :
166 : static bool
167 0 : char_is_cased_builtin(char ch, pg_locale_t locale)
168 : {
169 0 : return IS_HIGHBIT_SET(ch) ||
170 0 : (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
171 : }
172 :
173 : static pg_wchar
174 528 : wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
175 : {
176 528 : return unicode_uppercase_simple(wc);
177 : }
178 :
179 : static pg_wchar
180 528 : wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
181 : {
182 528 : return unicode_lowercase_simple(wc);
183 : }
184 :
185 : static const struct ctype_methods ctype_methods_builtin = {
186 : .strlower = strlower_builtin,
187 : .strtitle = strtitle_builtin,
188 : .strupper = strupper_builtin,
189 : .strfold = strfold_builtin,
190 : .wc_isdigit = wc_isdigit_builtin,
191 : .wc_isalpha = wc_isalpha_builtin,
192 : .wc_isalnum = wc_isalnum_builtin,
193 : .wc_isupper = wc_isupper_builtin,
194 : .wc_islower = wc_islower_builtin,
195 : .wc_isgraph = wc_isgraph_builtin,
196 : .wc_isprint = wc_isprint_builtin,
197 : .wc_ispunct = wc_ispunct_builtin,
198 : .wc_isspace = wc_isspace_builtin,
199 : .char_is_cased = char_is_cased_builtin,
200 : .wc_tolower = wc_tolower_builtin,
201 : .wc_toupper = wc_toupper_builtin,
202 : };
203 :
204 : pg_locale_t
205 1804 : create_pg_locale_builtin(Oid collid, MemoryContext context)
206 : {
207 : const char *locstr;
208 : pg_locale_t result;
209 :
210 1804 : if (collid == DEFAULT_COLLATION_OID)
211 : {
212 : HeapTuple tp;
213 : Datum datum;
214 :
215 1748 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
216 1748 : if (!HeapTupleIsValid(tp))
217 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
218 1748 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
219 : Anum_pg_database_datlocale);
220 1748 : locstr = TextDatumGetCString(datum);
221 1748 : ReleaseSysCache(tp);
222 : }
223 : else
224 : {
225 : HeapTuple tp;
226 : Datum datum;
227 :
228 56 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
229 56 : if (!HeapTupleIsValid(tp))
230 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
231 56 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
232 : Anum_pg_collation_colllocale);
233 56 : locstr = TextDatumGetCString(datum);
234 56 : ReleaseSysCache(tp);
235 : }
236 :
237 1804 : builtin_validate_locale(GetDatabaseEncoding(), locstr);
238 :
239 1804 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
240 :
241 1804 : result->info.builtin.locale = MemoryContextStrdup(context, locstr);
242 1804 : result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
243 1804 : result->deterministic = true;
244 1804 : result->collate_is_c = true;
245 1804 : result->ctype_is_c = (strcmp(locstr, "C") == 0);
246 1804 : if (!result->ctype_is_c)
247 1782 : result->ctype = &ctype_methods_builtin;
248 :
249 1804 : return result;
250 : }
251 :
252 : char *
253 1880 : get_collation_actual_version_builtin(const char *collcollate)
254 : {
255 : /*
256 : * The only two supported locales (C and C.UTF-8) are both based on memcmp
257 : * and are not expected to change, but track the version anyway.
258 : *
259 : * Note that the character semantics may change for some locales, but the
260 : * collation version only tracks changes to sort order.
261 : */
262 1880 : if (strcmp(collcollate, "C") == 0)
263 48 : return "1";
264 1832 : else if (strcmp(collcollate, "C.UTF-8") == 0)
265 1808 : return "1";
266 24 : else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
267 24 : return "1";
268 : else
269 0 : ereport(ERROR,
270 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
271 : errmsg("invalid locale name \"%s\" for builtin provider",
272 : collcollate)));
273 :
274 : return NULL; /* keep compiler quiet */
275 : }
|