Add unicode_strtitle() for Unicode Default Case Conversion.
author: Jeff Davis <jdavis@postgresql.org>
Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
committer: Jeff Davis <jdavis@postgresql.org>
Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
This brings the titlecasing implementation for the builtin provider
out of formatting.c and into unicode_case.c, along with
unicode_strlower() and unicode_strupper(). Accepts an arbitrary word
boundary callback.

Simple for now, but can be extended to support the Unicode Default
Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
src/backend/utils/adt/formatting.c
src/common/unicode_case.c
src/include/common/unicode_case.h

index 79df80704d751e4f0ef31ff56077ffd27789306a..8736ada4be296116862e3aa699e8a1b325ad0be8 100644 (file)
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
    return result;
 }
 
+struct WordBoundaryState
+{
+   const char *str;            /* text being scanned; decoded with utf8_to_unicode() */
+   size_t      len;            /* scanning stops at this byte offset */
+   size_t      offset;         /* byte offset of the next character to examine */
+   bool        init;           /* false until the first boundary has been returned */
+   bool        prev_alnum;     /* pg_u_isalnum() result at the last returned boundary */
+};
+
+/*
+ * Simple word boundary iterator: reports the first character's offset, then a
+ * boundary each time the result of pg_u_isalnum() changes, then the length.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+   struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+   while (wbstate->offset < wbstate->len &&
+          wbstate->str[wbstate->offset] != '\0')
+   {
+       pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
+                                       wbstate->offset);
+       bool        curr_alnum = pg_u_isalnum(u, true);
+
+       if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+       {
+           size_t      prev_offset = wbstate->offset;  /* where this character starts */
+
+           wbstate->init = true;
+           wbstate->offset += unicode_utf8len(u);  /* resume after it on the next call */
+           wbstate->prev_alnum = curr_alnum;
+           return prev_offset;
+       }
+
+       wbstate->offset += unicode_utf8len(u);  /* same class as before: no boundary */
+   }
+
+   return wbstate->len;        /* scan finished: report the length as the last boundary */
+}
+
 /*
  * collation-aware, wide-character-aware initcap function
  *
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 #endif
        if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
        {
-           const unsigned char *src = (unsigned char *) buff;
+           const char *src = buff;
            size_t      srclen = nbytes;
-           unsigned char *dst;
            size_t      dstsize;
-           int         srcoff = 0;
-           int         dstoff = 0;
+           char       *dst;
+           size_t      needed;
+           struct WordBoundaryState wbstate = {
+               .str = src,
+               .len = srclen,
+               .offset = 0,
+               .init = false,
+               .prev_alnum = false,
+           };
 
            Assert(GetDatabaseEncoding() == PG_UTF8);
 
-           /* overflow paranoia */
-           if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
-               ereport(ERROR,
-                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory")));
-
-           /* result is at most srclen codepoints plus terminating NUL */
-           dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
-           dst = (unsigned char *) palloc(dstsize);
+           /* first try buffer of equal size plus terminating NUL */
+           dstsize = srclen + 1;
+           dst = palloc(dstsize);
 
-           while (srcoff < nbytes)
+           needed = unicode_strtitle(dst, dstsize, src, srclen,
+                                     initcap_wbnext, &wbstate);
+           if (needed + 1 > dstsize)
            {
-               pg_wchar    u1 = utf8_to_unicode(src + srcoff);
-               pg_wchar    u2;
-               int         u1len = unicode_utf8len(u1);
-               int         u2len;
-
-               if (wasalnum)
-                   u2 = unicode_lowercase_simple(u1);
-               else
-                   u2 = unicode_uppercase_simple(u1);
+               /* reset iterator */
+               wbstate.offset = 0;
+               wbstate.init = false;
 
-               u2len = unicode_utf8len(u2);
-
-               Assert(dstoff + u2len + 1 <= dstsize);
-
-               wasalnum = pg_u_isalnum(u2, true);
-
-               unicode_to_utf8(u2, dst + dstoff);
-               srcoff += u1len;
-               dstoff += u2len;
+               /* grow buffer if needed and retry */
+               dstsize = needed + 1;
+               dst = repalloc(dst, dstsize);
+               needed = unicode_strtitle(dst, dstsize, src, srclen,
+                                         initcap_wbnext, &wbstate);
+               Assert(needed + 1 == dstsize);
            }
 
-           Assert(dstoff + 1 <= dstsize);
-           *(dst + dstoff) = '\0';
-           dstoff++;
-
-           /* allocate result buffer of the right size and free workspace */
-           result = palloc(dstoff);
-           memcpy(result, dst, dstoff);
-           pfree(dst);
+           result = dst;
        }
        else
        {
index 5e77490006fc60cdf86549f55431e8d50b53ad1c..bc423b0890c4d7d6a3a9954878139a3da6d2ca87 100644 (file)
@@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"
 
 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-                          ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+                          CaseKind str_casekind, WordBoundaryNext wbnext,
+                          void *wbstate);
 
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-   return convert_case(dst, dstsize, src, srclen, CaseLower);
+   return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase, and return the result length (not including
+ * terminating NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state wbstate;
+ * wbnext and wbstate must both be non-NULL. The callback should first return
+ * offset 0 for the first boundary; then the offset of each subsequent word
+ * boundary; then the total length of the string as the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+                WordBoundaryNext wbnext, void *wbstate)
+{
+   return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+                       wbstate);
 }
 
 /*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-   return convert_case(dst, dstsize, src, srclen, CaseUpper);
+   return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 }
 
 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
  *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, maps characters found on a word boundary to
+ * uppercase and other characters to lowercase.
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-            CaseKind casekind)
+            CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
+   /* character CaseKind varies while titlecasing */
+   CaseKind    chr_casekind = str_casekind;
    size_t      srcoff = 0;
    size_t      result_len = 0;
+   size_t      boundary = 0;
+
+   Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+          (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+   if (str_casekind == CaseTitle)
+   {
+       boundary = wbnext(wbstate);
+       Assert(boundary == 0);  /* start of text is always a boundary */
+   }
 
    while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
    {
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
        int         u1len = unicode_utf8len(u1);
        const       pg_case_map *casemap = find_case_map(u1);
 
+       if (str_casekind == CaseTitle)
+       {
+           if (srcoff == boundary)
+           {
+               chr_casekind = CaseUpper;
+               boundary = wbnext(wbstate);
+           }
+           else
+               chr_casekind = CaseLower;
+       }
+
+       /* perform mapping, update result_len, and write to dst */
        if (casemap)
        {
-           pg_wchar    u2 = casemap->simplemap[casekind];
+           pg_wchar    u2 = casemap->simplemap[chr_casekind];
            pg_wchar    u2len = unicode_utf8len(u2);
 
            if (result_len + u2len <= dstsize)
index df36d8db2135b94e8a5aceb875edd352d3fe518a..c0c3382e79ec0cf34928182ca1cff968aa25e029 100644 (file)
 
 #include "mb/pg_wchar.h"
 
+typedef size_t (*WordBoundaryNext) (void *wbstate);
+
 pg_wchar   unicode_lowercase_simple(pg_wchar ucs);
 pg_wchar   unicode_titlecase_simple(pg_wchar ucs);
 pg_wchar   unicode_uppercase_simple(pg_wchar ucs);
 size_t     unicode_strlower(char *dst, size_t dstsize, const char *src,
                             ssize_t srclen);
+size_t     unicode_strtitle(char *dst, size_t dstsize, const char *src,
+                            ssize_t srclen, WordBoundaryNext wbnext,
+                            void *wbstate);
 size_t     unicode_strupper(char *dst, size_t dstsize, const char *src,
                             ssize_t srclen);