doxygen/G.https/unicode__case_8c_source.html

/*-------------------------------------------------------------------------

 * unicode_case.c

 *      Unicode case mapping and case conversion.

 *

 * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group

 *

 * IDENTIFICATION

 *    src/common/unicode_case.c

 *

 *-------------------------------------------------------------------------

 */

#ifndef FRONTEND

#include "postgres.h"

#else

#include "postgres_fe.h"

#endif


#include "common/unicode_case.h"

#include "common/unicode_case_table.h"

#include "common/unicode_category.h"

#include "mb/pg_wchar.h"


/*
 * Outcome of casemap() for a single codepoint. It tells the caller which of
 * casemap()'s output arguments, if any, holds the mapping.
 */
enum CaseMapResult
{
    /* no mapping found; the caller copies the source character unchanged */
    CASEMAP_SELF,
    /* a single-codepoint mapping was stored in casemap()'s 'simple' argument */
    CASEMAP_SIMPLE,
    /* a multi-codepoint mapping was stored in casemap()'s 'special' argument */
    CASEMAP_SPECIAL,
};


/*
 * Map for each case kind.
 *
 * Indexed by CaseKind; each entry points at the simple (one codepoint to one
 * codepoint) mapping table for that kind. casemap() selects the table to
 * consult through this array.
 */
static const char32_t *const casekind_map[NCaseKind] =
{
    [CaseLower] = case_map_lower,
    [CaseTitle] = case_map_title,
    [CaseUpper] = case_map_upper,
    [CaseFold] = case_map_fold,
};


/* Forward declarations of static helpers defined later in this file. */
static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                           CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
                           void *wbstate);
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
                                  const char *src, size_t srclen, size_t srcoff,
                                  char32_t *simple, const char32_t **special);


/*
 * unicode_lowercase_simple()
 *
 * Return the simple lowercase mapping of code, or code itself if the
 * lowercase table has no entry for it.
 */
char32_t
unicode_lowercase_simple(char32_t code)
{
    const char32_t mapped = find_case_map(code, case_map_lower);

    /* find_case_map() reports "no entry" as 0 */
    if (mapped == 0)
        return code;

    return mapped;
}


/*
 * unicode_titlecase_simple()
 *
 * Return the simple titlecase mapping of code, or code itself if the
 * titlecase table has no entry for it.
 */
char32_t
unicode_titlecase_simple(char32_t code)
{
    const char32_t mapped = find_case_map(code, case_map_title);

    /* find_case_map() reports "no entry" as 0 */
    if (mapped == 0)
        return code;

    return mapped;
}


/*
 * unicode_uppercase_simple()
 *
 * Return the simple uppercase mapping of code, or code itself if the
 * uppercase table has no entry for it.
 */
char32_t
unicode_uppercase_simple(char32_t code)
{
    const char32_t mapped = find_case_map(code, case_map_upper);

    /* find_case_map() reports "no entry" as 0 */
    if (mapped == 0)
        return code;

    return mapped;
}


/*
 * unicode_casefold_simple()
 *
 * Return the simple case folding of code, or code itself if the case
 * folding table has no entry for it.
 */
char32_t
unicode_casefold_simple(char32_t code)
{
    const char32_t mapped = find_case_map(code, case_map_fold);

    /* find_case_map() reports "no entry" as 0 */
    if (mapped == 0)
        return code;

    return mapped;
}


/*
 * unicode_strlower()
 *
 * Lowercase the UTF-8 string src and return the length of the result, not
 * counting any terminating NUL.
 *
 * src must be encoded in UTF-8. A negative srclen means that src is
 * NUL-terminated.
 *
 * The result is written to dst and is truncated if it is larger than
 * dstsize. dst is NUL-terminated only when dstsize is greater than the
 * result length.
 *
 * dst may be NULL when dstsize is zero; callers can use that to compute the
 * required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available and
 * where their conditions are satisfied.
 */
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                 bool full)
{
    size_t      result_len;

    /* no word boundary callback is needed for lowercasing */
    result_len = convert_case(dst, dstsize, src, srclen, CaseLower, full,
                              NULL, NULL);

    return result_len;
}


/*
 * unicode_strtitle()
 *
 * Titlecase the UTF-8 string src and return the length of the result, not
 * counting any terminating NUL.
 *
 * src must be encoded in UTF-8. A negative srclen means that src is
 * NUL-terminated.
 *
 * The result is written to dst and is truncated if it is larger than
 * dstsize. dst is NUL-terminated only when dstsize is greater than the
 * result length.
 *
 * dst may be NULL when dstsize is zero; callers can use that to compute the
 * required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available and
 * where their conditions are satisfied. When full is false, only simple
 * mappings are used, and uppercase is used in place of titlecase.
 *
 * Titlecasing needs to know where words begin and end. The callback wbnext
 * supplies that: a word boundary is the offset of the start of a word, or
 * the offset of the character immediately following a word.
 *
 * The caller initializes and frees the callback state wbstate. The callback
 * should return offset 0 as the first boundary, then the offset of each
 * subsequent boundary, and finally the total length of the string to mark
 * the last boundary.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                 bool full, WordBoundaryNext wbnext, void *wbstate)
{
    size_t      result_len;

    /* hand the boundary callback and its state through unchanged */
    result_len = convert_case(dst, dstsize, src, srclen, CaseTitle, full,
                              wbnext, wbstate);

    return result_len;
}


/*
 * unicode_strupper()
 *
 * Uppercase the UTF-8 string src and return the length of the result, not
 * counting any terminating NUL.
 *
 * src must be encoded in UTF-8. A negative srclen means that src is
 * NUL-terminated.
 *
 * The result is written to dst and is truncated if it is larger than
 * dstsize. dst is NUL-terminated only when dstsize is greater than the
 * result length.
 *
 * dst may be NULL when dstsize is zero; callers can use that to compute the
 * required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available and
 * where their conditions are satisfied.
 */
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                 bool full)
{
    size_t      result_len;

    /* no word boundary callback is needed for uppercasing */
    result_len = convert_case(dst, dstsize, src, srclen, CaseUpper, full,
                              NULL, NULL);

    return result_len;
}


/*
 * unicode_strfold()
 *
 * Case fold the UTF-8 string src and return the length of the result, not
 * counting any terminating NUL.
 *
 * src must be encoded in UTF-8. A negative srclen means that src is
 * NUL-terminated.
 *
 * The result is written to dst and is truncated if it is larger than
 * dstsize. dst is NUL-terminated only when dstsize is greater than the
 * result length.
 *
 * dst may be NULL when dstsize is zero; callers can use that to compute the
 * required buffer size before allocating.
 *
 * full is passed through to convert_case(), which uses special mappings for
 * relevant characters when it is true.
 */
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                bool full)
{
    size_t      result_len;

    /* no word boundary callback is needed for case folding */
    result_len = convert_case(dst, dstsize, src, srclen, CaseFold, full,
                              NULL, NULL);

    return result_len;
}


/*
 * convert_case()
 *
 * Implement Unicode Default Case Conversion algorithm. This is the common
 * worker behind unicode_strlower(), unicode_strtitle(), unicode_strupper()
 * and unicode_strfold().
 *
 * Input is read from src until srclen bytes have been consumed or a NUL byte
 * is reached, whichever comes first. If srclen < 0, only a NUL byte ends the
 * input.
 *
 * Each converted character is stored in dst only if it fits entirely within
 * dstsize bytes. The return value is the length in bytes of the complete
 * converted string (not including a terminating NUL), whether or not it all
 * fit. dst is NUL-terminated only if that length is less than dstsize. With
 * dstsize zero nothing is written, so dst may be NULL.
 *
 * If str_casekind is CaseLower, CaseUpper or CaseFold, map each character in
 * the string for which a mapping is available; wbnext and wbstate must be
 * NULL.
 *
 * If str_casekind is CaseTitle, map characters found on a word boundary to
 * titlecase (or uppercase if full is false) and other characters to
 * lowercase; wbnext and wbstate must be provided. NB: does not currently
 * implement the Unicode behavior in which the word boundary is adjusted to
 * the next Cased character. That behavior could be implemented as an option,
 * but it doesn't match the default behavior of ICU, nor does it match the
 * documented behavior of INITCAP().
 *
 * If full is true, use special mappings for relevant characters, which can
 * map a single codepoint to multiple codepoints, or depend on conditions.
 */
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
             CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
             void *wbstate)
{
    /* character CaseKind varies while titlecasing */
    CaseKind    chr_casekind = str_casekind;
    size_t      srcoff = 0;
    size_t      result_len = 0;
    size_t      boundary = 0;

    Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
           (str_casekind != CaseTitle && !wbnext && !wbstate));

    if (str_casekind == CaseTitle)
    {
        boundary = wbnext(wbstate);
        Assert(boundary == 0);  /* start of text is always a boundary */
    }

    /*
     * The cast is safe: srclen is converted to size_t only once it is known
     * to be non-negative.
     */
    while ((srclen < 0 || srcoff < (size_t) srclen) && src[srcoff] != '\0')
    {
        char32_t    u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
        size_t      u1len = (size_t) unicode_utf8len(u1);
        char32_t    simple = 0;
        const char32_t *special = NULL;
        enum CaseMapResult casemap_result;

        if (str_casekind == CaseTitle)
        {
            if (srcoff == boundary)
            {
                /* first character of a word; fetch the next boundary */
                chr_casekind = full ? CaseTitle : CaseUpper;
                boundary = wbnext(wbstate);
            }
            else
                chr_casekind = CaseLower;
        }

        /*
         * casemap() takes the source length as size_t. A negative srclen
         * deliberately converts to a huge value there, leaving the NUL
         * terminator as the only end-of-input marker.
         */
        casemap_result = casemap(u1, chr_casekind, full, src,
                                 (size_t) srclen, srcoff,
                                 &simple, &special);

        /*
         * In every case below, result_len advances even when the character
         * is not stored, so that the caller learns the required size.
         */
        switch (casemap_result)
        {
            case CASEMAP_SELF:
                /* no mapping; copy bytes from src */
                Assert(simple == 0);
                Assert(special == NULL);
                if (result_len + u1len <= dstsize)
                    memcpy(dst + result_len, src + srcoff, u1len);

                result_len += u1len;
                break;
            case CASEMAP_SIMPLE:
                {
                    /* replace with single character */
                    char32_t    u2 = simple;
                    size_t      u2len = (size_t) unicode_utf8len(u2);

                    Assert(special == NULL);
                    if (result_len + u2len <= dstsize)
                        unicode_to_utf8(u2, (unsigned char *) dst + result_len);

                    result_len += u2len;
                }
                break;
            case CASEMAP_SPECIAL:
                /* replace with up to MAX_CASE_EXPANSION characters */
                Assert(simple == 0);
                for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
                {
                    char32_t    u2 = special[i];
                    size_t      u2len = (size_t) unicode_utf8len(u2);

                    if (result_len + u2len <= dstsize)
                        unicode_to_utf8(u2, (unsigned char *) dst + result_len);

                    result_len += u2len;
                }
                break;
        }

        srcoff += u1len;
    }

    /* terminate only if there is room left after the converted text */
    if (result_len < dstsize)
        dst[result_len] = '\0';

    return result_len;
}


/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be preceded by a Cased
 * character, and must not be followed by a Cased character.
 *
 * Case_Ignorable characters are ignored in both directions. NB: some
 * characters may be both Cased and Case_Ignorable, in which case they are
 * ignored.
 *
 * str is the UTF-8 string, len its length in bytes (the forward scan also
 * stops at a NUL byte), and offset the byte offset of the character being
 * tested.
 *
 * Returns false if the start of the string is reached while scanning
 * backwards without finding a Cased character, that is, when the character
 * is preceded by nothing or only by Case_Ignorable characters.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
    bool        preceded_by_cased = false;

    /*
     * Iterate backwards, looking for a Cased character. If offset is 0 the
     * loop body never runs, because the start of the string is not preceded
     * by a Cased character.
     */
    for (size_t i = offset; i > 0;)
    {
        char32_t    curr;

        i--;

        /* continuation byte: keep going back to the start of the character */
        if ((str[i] & 0xC0) == 0x80)
            continue;

        /* ASCII or lead byte: decode the character starting here */
        curr = utf8_to_unicode(str + i);

        if (pg_u_prop_case_ignorable(curr))
            continue;

        if (!pg_u_prop_cased(curr))
            return false;

        preceded_by_cased = true;
        break;
    }

    /* only Case_Ignorable characters (or nothing) precede the offset */
    if (!preceded_by_cased)
        return false;

    /* end of string is not followed by a Cased character */
    if (offset == len)
        return true;

    /*
     * Iterate forwards, looking for a Cased character. Starting at
     * offset + 1 is fine even though the character at offset may be
     * multibyte, because its continuation bytes are skipped below.
     */
    for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
    {
        char32_t    curr;

        if ((str[i] & 0xC0) == 0x80)
            continue;

        curr = utf8_to_unicode(str + i);

        if (pg_u_prop_case_ignorable(curr))
            continue;

        /* first character that is not ignorable decides the outcome */
        return !pg_u_prop_cased(curr);
    }

    return true;
}


/*
 * Decide whether a special case mapping applies at the given position.
 *
 * Unicode allows special casing to be applied only under certain
 * circumstances. A conditions value of 0 means the mapping is unconditional.
 * The only currently-supported condition is Final_Sigma, which is evaluated
 * against the character at byte offset 'offset' of str.
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
                         size_t offset)
{
    /* unconditional mapping */
    if (conditions == 0)
        return true;

    if (conditions != PG_U_FINAL_SIGMA)
    {
        /* no other conditions supported */
        Assert(false);
        return false;
    }

    return check_final_sigma((const unsigned char *) str, len, offset);
}


/*
 * Map the character u1 to the case given by casekind.
 *
 * Codepoints below 0x80 are looked up directly in the simple table and
 * always yield CASEMAP_SIMPLE.
 *
 * For other codepoints, if no mapping entry exists, CASEMAP_SELF is returned
 * and the caller should copy the character without modification.
 *
 * If full is true, a special case mapping exists, and its conditions are met
 * at byte offset srcoff of src, then 'special' is set to the mapping result
 * (an array of up to MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is
 * returned.
 *
 * Otherwise 'simple' is set from the simple mapping table and CASEMAP_SIMPLE
 * is returned.
 */
static enum CaseMapResult
casemap(char32_t u1, CaseKind casekind, bool full,
        const char *src, size_t srclen, size_t srcoff,
        char32_t *simple, const char32_t **special)
{
    const char32_t *simple_map = casekind_map[casekind];
    uint16      idx;

    /* Fast path for codepoints < 0x80 */
    if (u1 < 0x80)
    {
        /*
         * Index 0 of every table is reserved as 0 (as NULL), so the data
         * starts at index 1 rather than 0.
         */
        *simple = simple_map[u1 + 1];
        return CASEMAP_SIMPLE;
    }

    idx = case_index(u1);
    if (idx == 0)
        return CASEMAP_SELF;

    if (full)
    {
        unsigned int special_idx = case_map_special[idx];

        /* a zero entry means there is no special mapping */
        if (special_idx != 0 &&
            check_special_conditions(special_case[special_idx].conditions,
                                     src, srclen, srcoff))
        {
            *special = special_case[special_idx].map[casekind];
            return CASEMAP_SPECIAL;
        }
    }

    *simple = simple_map[idx];
    return CASEMAP_SIMPLE;
}


/*
 * Look up ucs in the given simple case map.
 *
 * Returns the table entry for ucs. If the entry does not exist, 0 will be
 * returned.
 */
static char32_t
find_case_map(char32_t ucs, const char32_t *map)
{
    size_t      slot;

    if (ucs < 0x80)
    {
        /*
         * Fast path for codepoints < 0x80. The first elements in all tables
         * are reserved as 0 (as NULL), so these entries are shifted by one.
         */
        slot = ucs + 1;
    }
    else
        slot = case_index(ucs);

    return map[slot];
}

idx
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:262

uint16
uint16_t uint16
Definition: c.h:551

Assert
Assert(PointerIsAligned(start, uint64))

str
const char * str
Definition: hashfn_unstable.h:254

i
int i
Definition: isn.c:77

utf8_to_unicode
static char32_t utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

pg_wchar.h

unicode_to_utf8
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition: pg_wchar.h:575

unicode_utf8len
static int unicode_utf8len(char32_t c)
Definition: pg_wchar.h:607

postgres.h

postgres_fe.h

pg_special_case::map
char32_t map[NCaseKind][MAX_CASE_EXPANSION]
Definition: unicode_case_table.h:47

unicode_titlecase_simple
char32_t unicode_titlecase_simple(char32_t code)
Definition: unicode_case.c:58

unicode_strupper
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:165

unicode_strlower
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:101

unicode_casefold_simple
char32_t unicode_casefold_simple(char32_t code)
Definition: unicode_case.c:74

unicode_lowercase_simple
char32_t unicode_lowercase_simple(char32_t code)
Definition: unicode_case.c:50

find_case_map
static char32_t find_case_map(char32_t ucs, const char32_t *map)
Definition: unicode_case.c:438

convert_case
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:213

unicode_strtitle
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:138

casekind_map
static const char32_t *const casekind_map[NCaseKind]
Definition: unicode_case.c:33

check_special_conditions
static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)
Definition: unicode_case.c:370

unicode_strfold
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:189

casemap
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
Definition: unicode_case.c:397

CaseMapResult
CaseMapResult
Definition: unicode_case.c:24

CASEMAP_SPECIAL
@ CASEMAP_SPECIAL
Definition: unicode_case.c:27

CASEMAP_SIMPLE
@ CASEMAP_SIMPLE
Definition: unicode_case.c:26

CASEMAP_SELF
@ CASEMAP_SELF
Definition: unicode_case.c:25

check_final_sigma
static bool check_final_sigma(const unsigned char *str, size_t len, size_t offset)
Definition: unicode_case.c:312

unicode_uppercase_simple
char32_t unicode_uppercase_simple(char32_t code)
Definition: unicode_case.c:66

unicode_case.h

WordBoundaryNext
size_t(* WordBoundaryNext)(void *wbstate)
Definition: unicode_case.h:17

unicode_case_table.h

case_map_special
static const uint8 case_map_special[1704]
Definition: unicode_case_table.h:7020

MAX_CASE_EXPANSION
#define MAX_CASE_EXPANSION
Definition: unicode_case_table.h:26

case_map_lower
static const char32_t case_map_lower[1704]
Definition: unicode_case_table.h:168

PG_U_FINAL_SIGMA
#define PG_U_FINAL_SIGMA
Definition: unicode_case_table.h:33

CaseKind
CaseKind
Definition: unicode_case_table.h:36

CaseFold
@ CaseFold
Definition: unicode_case_table.h:40

CaseTitle
@ CaseTitle
Definition: unicode_case_table.h:38

NCaseKind
@ NCaseKind
Definition: unicode_case_table.h:41

CaseLower
@ CaseLower
Definition: unicode_case_table.h:37

CaseUpper
@ CaseUpper
Definition: unicode_case_table.h:39

case_map_title
static const char32_t case_map_title[1704]
Definition: unicode_case_table.h:1881

special_case
static const pg_special_case special_case[106]
Definition: unicode_case_table.h:54

case_map_fold
static const char32_t case_map_fold[1704]
Definition: unicode_case_table.h:5307

case_index
static uint16 case_index(char32_t cp)
Definition: unicode_case_table.h:13524

case_map_upper
static const char32_t case_map_upper[1704]
Definition: unicode_case_table.h:3594

pg_u_prop_cased
bool pg_u_prop_cased(char32_t code)
Definition: unicode_category.c:144

pg_u_prop_case_ignorable
bool pg_u_prop_case_ignorable(char32_t code)
Definition: unicode_category.c:159

unicode_category.h