doxygen/G.https/case__test_8c_source.html

/*-------------------------------------------------------------------------

 * case_test.c

 *      Program to test Unicode case mapping functions.

 *

 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group

 *

 * IDENTIFICATION

 *    src/common/unicode/case_test.c

 *

 *-------------------------------------------------------------------------

 */

#include "postgres_fe.h"


#include <locale.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <wctype.h>


#ifdef USE_ICU

#include <unicode/ucasemap.h>

#include <unicode/uchar.h>

#endif

#include "common/unicode_case.h"

#include "common/unicode_category.h"

#include "common/unicode_version.h"


/* enough to hold largest source or result string, including NUL */

#define BUFSZ 256


#ifdef USE_ICU

/* ICU case map used by icu_test_full(); opened and closed in main() */
static UCaseMap * casemap = NULL;

#endif


/*
 * Signature shared by the tfunc_* wrappers that test_convert() drives.
 * test_convert() passes srclen = -1 for a NUL-terminated source and compares
 * the returned size against strlen() of the expected result.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,

                            ssize_t srclen);


/* simple boundary iterator copied from pg_locale_builtin.c */

/*
 * Iteration state consumed by initcap_wbnext(); callers fill it in before
 * handing it to unicode_strtitle() as the opaque word-boundary state.
 */
struct WordBoundaryState

{

    const char *str;            /* text being scanned */

    size_t      len;            /* upper bound for offset; also returned once the scan is exhausted */

    size_t      offset;         /* byte offset of the next character to examine */

    bool        posix;          /* forwarded to pg_u_isalnum() */

    bool        init;           /* false until the first boundary has been returned */

    bool        prev_alnum;     /* alnum class recorded at the most recently returned boundary */

};


/*
 * Word-boundary callback for unicode_strtitle().
 *
 * Advances the iterator in *state and returns the byte offset of the next
 * boundary.  The first character examined is always a boundary; after that a
 * boundary is reported wherever the alphanumeric class changes from the one
 * recorded at the previous boundary.  When the scan reaches either the len
 * limit or a NUL byte, the state's len is returned.
 */
static size_t
initcap_wbnext(void *state)
{
    struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

    for (;;)
    {
        size_t      here = wb->offset;
        pg_wchar    cp;
        bool        is_alnum;
        bool        at_boundary;

        /* stop at the length limit or at the terminator, whichever is first */
        if (here >= wb->len || wb->str[here] == '\0')
            break;

        cp = utf8_to_unicode((unsigned char *) wb->str + here);
        is_alnum = pg_u_isalnum(cp, wb->posix);
        at_boundary = (!wb->init || is_alnum != wb->prev_alnum);

        /* step past this character whether or not it starts a new run */
        wb->offset = here + unicode_utf8len(cp);

        if (at_boundary)
        {
            wb->init = true;
            wb->prev_alnum = is_alnum;
            return here;
        }
    }

    return wb->len;
}


#ifdef USE_ICU


/*
 * Compare the simple (single-codepoint) lower/title/upper/fold mappings of
 * "code" with the corresponding ICU results.  Prints both sets of values and
 * exits with status 1 if any of the four differ.
 */
static void
icu_test_simple(pg_wchar code)
{
    /* mappings produced by the functions under test */
    pg_wchar    pg_lower = unicode_lowercase_simple(code);
    pg_wchar    pg_title = unicode_titlecase_simple(code);
    pg_wchar    pg_upper = unicode_uppercase_simple(code);
    pg_wchar    pg_fold = unicode_casefold_simple(code);

    /* reference mappings from ICU */
    pg_wchar    ref_lower = u_tolower(code);
    pg_wchar    ref_title = u_totitle(code);
    pg_wchar    ref_upper = u_toupper(code);
    pg_wchar    ref_fold = u_foldCase(code, U_FOLD_CASE_DEFAULT);

    bool        all_match = (pg_lower == ref_lower &&
                             pg_title == ref_title &&
                             pg_upper == ref_upper &&
                             pg_fold == ref_fold);

    if (all_match)
        return;

    printf("case_test: FAILURE for codepoint 0x%06x\n", code);
    printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
           pg_lower, pg_title, pg_upper, pg_fold);
    printf("case_test: ICU lower/title/upper/fold:      0x%06x/0x%06x/0x%06x/0x%06x\n",
           ref_lower, ref_title, ref_upper, ref_fold);
    printf("\n");
    exit(1);
}


/*
 * Exit with a diagnostic unless an ICU case-mapping call produced a usable
 * result.  Besides real failures, U_STRING_NOT_TERMINATED_WARNING is rejected
 * too, because the caller goes on to treat the output as a C string.
 */
static void
icu_check_status(const char *funcname, const char *str, UErrorCode status)
{
    if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
    {
        printf("case_test: %s failed for str='%s': %s\n",
               funcname, str, u_errorName(status));
        exit(1);
    }
}

/*
 * Run the full (string-level) lower/title/upper/fold conversions on the
 * NUL-terminated UTF-8 string "str" and compare each result with what ICU
 * produces through the global casemap.  Any ICU error or any difference is
 * reported and terminates the program with status 1.
 */
static void
icu_test_full(char *str)
{
    char        lower[BUFSZ];
    char        title[BUFSZ];
    char        upper[BUFSZ];
    char        fold[BUFSZ];
    char        icu_lower[BUFSZ];
    char        icu_title[BUFSZ];
    char        icu_upper[BUFSZ];
    char        icu_fold[BUFSZ];
    UErrorCode  status;

    /* full case mapping doesn't use posix semantics */
    struct WordBoundaryState wbstate = {
        .str = str,
        .len = strlen(str),
        .offset = 0,
        .posix = false,
        .init = false,
        .prev_alnum = false,
    };

    unicode_strlower(lower, BUFSZ, str, -1, true);
    unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
    unicode_strupper(upper, BUFSZ, str, -1, true);
    unicode_strfold(fold, BUFSZ, str, -1, true);

    /*
     * Verify every ICU call before looking at its output: on failure the
     * destination buffer's contents cannot be relied upon.
     */
    status = U_ZERO_ERROR;
    ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
    icu_check_status("ucasemap_utf8ToLower", str, status);

    status = U_ZERO_ERROR;
    ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
    icu_check_status("ucasemap_utf8ToTitle", str, status);

    status = U_ZERO_ERROR;
    ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
    icu_check_status("ucasemap_utf8ToUpper", str, status);

    status = U_ZERO_ERROR;
    ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
    icu_check_status("ucasemap_utf8FoldCase", str, status);

    if (strcmp(lower, icu_lower) != 0)
    {
        printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
               icu_lower);
        exit(1);
    }
    if (strcmp(title, icu_title) != 0)
    {
        printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
               icu_title);
        exit(1);
    }
    if (strcmp(upper, icu_upper) != 0)
    {
        printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
               icu_upper);
        exit(1);
    }
    if (strcmp(fold, icu_fold) != 0)
    {
        printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
               icu_fold);
        exit(1);
    }
}


/*
 * Walk every codepoint from 0 through 0x10ffff and, for each one that has an
 * assigned category both here and in ICU, check the simple mappings and the
 * full mappings of its UTF-8 encoding against ICU.
 *
 * Codepoints assigned here but reported as unassigned by ICU are counted and
 * skipped.  A summary is printed at the end; mismatches exit inside the
 * per-codepoint helpers.
 */
static void
test_icu(void)
{
    int         n_ok = 0;
    int         n_skipped = 0;
    pg_wchar    code;

    for (code = 0; code <= 0x10ffff; code++)
    {
        uint8_t     icu_category;
        char        utf8[5] = {0};  /* re-zeroed on each iteration */

        if (unicode_category(code) == PG_U_UNASSIGNED)
            continue;

        icu_category = u_charType(code);
        if (icu_category == PG_U_UNASSIGNED)
        {
            n_skipped++;
            continue;
        }

        icu_test_simple(code);

        unicode_to_utf8(code, (unsigned char *) utf8);
        icu_test_full(utf8);

        n_ok++;
    }

    if (n_skipped > 0)
        printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
               n_skipped);

    printf("case_test: ICU simple mapping test: %d codepoints successful\n",
           n_ok);
}

#endif


/*
 * Allocate "size" bytes for test_convert(), exiting with a diagnostic on
 * failure.  A zero-byte request is rounded up to one byte, since malloc(0)
 * is allowed to return NULL and the result is passed to memcpy/memset.
 */
static void *
test_convert_alloc(size_t size)
{
    void       *ptr = malloc(size > 0 ? size : 1);

    if (ptr == NULL)
    {
        printf("case_test: out of memory\n");
        exit(1);
    }
    return ptr;
}

/*
 * Check that tfunc converts test_string into expected under all four
 * combinations of NUL-terminated / length-delimited source and destination.
 *
 * The length-delimited buffers are allocated at exactly the string length
 * (no room for a terminator), and every destination is pre-filled with 0x7F
 * so stale bytes cannot pass for a correct result.  For each combination the
 * returned size must equal strlen(expected) and the output bytes must match.
 * Any failure prints a diagnostic and exits with status 1.
 */
static void
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
{
    size_t      explen = strlen(expected);
    size_t      src1len = strlen(test_string);
    ssize_t     src2len = -1;   /* NUL-terminated */
    size_t      dst1len = explen;
    size_t      dst2len = explen + 1;   /* NUL-terminated */
    char       *src1 = test_convert_alloc(src1len);
    char       *dst1 = test_convert_alloc(dst1len);
    char       *src2 = strdup(test_string);
    char       *dst2 = test_convert_alloc(dst2len);
    size_t      needed;

    if (src2 == NULL)
    {
        printf("case_test: out of memory\n");
        exit(1);
    }

    memcpy(src1, test_string, src1len); /* not NUL-terminated */

    /* neither source nor destination are NUL-terminated */
    memset(dst1, 0x7F, dst1len);
    needed = tfunc(dst1, dst1len, src1, (ssize_t) src1len);
    if (needed != explen)
    {
        printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
               test_string, needed, explen);
        exit(1);
    }
    if (memcmp(dst1, expected, dst1len) != 0)
    {
        printf("case_test: convert_case test1 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
               test_string, (int) dst1len, dst1, expected);
        exit(1);
    }

    /* destination is NUL-terminated and source is not */
    memset(dst2, 0x7F, dst2len);
    needed = tfunc(dst2, dst2len, src1, (ssize_t) src1len);
    if (needed != explen)
    {
        printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
               test_string, needed, explen);
        exit(1);
    }
    if (strcmp(dst2, expected) != 0)
    {
        printf("case_test: convert_case test2 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
               test_string, dst2, expected);
        exit(1);
    }

    /* source is NUL-terminated and destination is not */
    memset(dst1, 0x7F, dst1len);
    needed = tfunc(dst1, dst1len, src2, src2len);
    if (needed != explen)
    {
        printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
               test_string, needed, explen);
        exit(1);
    }
    if (memcmp(dst1, expected, dst1len) != 0)
    {
        printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
               test_string, (int) dst1len, dst1, expected);
        exit(1);
    }

    /* both source and destination are NUL-terminated */
    memset(dst2, 0x7F, dst2len);
    needed = tfunc(dst2, dst2len, src2, src2len);
    if (needed != explen)
    {
        printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
               test_string, needed, explen);
        exit(1);
    }
    if (strcmp(dst2, expected) != 0)
    {
        printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
               test_string, dst2, expected);
        exit(1);
    }

    free(src1);
    free(dst1);
    free(src2);
    free(dst2);
}


/* TestFunc adapter: unicode_strlower() with full case mapping enabled. */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
            ssize_t srclen)
{
    const bool  full = true;

    return unicode_strlower(dst, dstsize, src, srclen, full);
}


static size_t

tfunc_title(char *dst, size_t dstsize, const char *src,

            ssize_t srclen)

{

    struct WordBoundaryState wbstate = {

        .str = src,

        .len = srclen,

        .offset = 0,

        .init = false,

        .prev_alnum = false,

    };


    return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,

                            &wbstate);

}


/* TestFunc adapter: unicode_strupper() with full case mapping enabled. */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
            ssize_t srclen)
{
    const bool  full = true;

    return unicode_strupper(dst, dstsize, src, srclen, full);
}


/* TestFunc adapter: unicode_strfold() with full case mapping enabled. */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
           ssize_t srclen)
{
    const bool  full = true;

    return unicode_strfold(dst, dstsize, src, srclen, full);
}


/*
 * Run the fixed set of string conversion cases through test_convert() and,
 * when built with ICU, cross-check a set of strings with icu_test_full().
 * Each helper exits on failure, so reaching the final message means every
 * case passed.
 */
static void
test_convert_case(void)
{
    /* test string with no case changes */
    test_convert(tfunc_lower, "√∞", "√∞");
    /* test adjust-to-cased behavior */
    test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
    /* test string with case changes */
    test_convert(tfunc_upper, "abc", "ABC");
    /* test string with case changes and byte length changes */
    test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
    /* test special case conversions */
    test_convert(tfunc_upper, "ß", "SS");
    test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
    test_convert(tfunc_upper, "ıiIİ", "IIIİ");
    test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
    /* test final sigma */
    test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
    test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
    test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
    test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
    /* test that alphanumerics are word characters */
    test_convert(tfunc_title, "λλ", "Λλ");
    test_convert(tfunc_title, "1a", "1a");
    /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
    test_convert(tfunc_title, "\uFF11a", "\uFF11a");

#ifdef USE_ICU
    icu_test_full("");
    icu_test_full("ȺȺȺ");
    icu_test_full("ßßß");
    icu_test_full("√∞");
    icu_test_full("a b");
    icu_test_full("abc 123xyz");
    icu_test_full("σςΣ ΣΣΣ");
    icu_test_full("ıiIİ");
    icu_test_full("\uFF11a");
    /* test <alpha><iota_subscript><acute> */
    icu_test_full("\u0391\u0345\u0301");
#endif

    printf("case_test: convert_case: success\n");
}


/*
 * Entry point: report the Unicode versions in use, run the exhaustive ICU
 * comparison when built with ICU, then run the fixed conversion cases.
 * The test helpers exit with status 1 on the first failure; falling through
 * to the end means success.
 */
int
main(int argc, char **argv)
{
#ifdef USE_ICU
    UErrorCode  open_status = U_ZERO_ERROR;

    /*
     * Open the shared case map with ICU's titlecase word-break adjustment
     * turned off, so its titlecasing lines up with what unicode_strtitle()
     * is expected to produce.
     */
    casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT,
                            &open_status);
    if (U_FAILURE(open_status))
    {
        printf("case_test: failure opening UCaseMap: %s\n",
               u_errorName(open_status));
        exit(1);
    }
#endif

    printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);

#ifndef USE_ICU
    printf("case_test: ICU not available; skipping\n");
#else
    printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
    test_icu();
#endif

    test_convert_case();

#ifdef USE_ICU
    ucasemap_close(casemap);
#endif

    return 0;
}

test_convert_case
static void test_convert_case()
Definition: case_test.c:331

test_convert
static void test_convert(TestFunc tfunc, const char *test_string, const char *expected)
Definition: case_test.c:208

main
int main(int argc, char **argv)
Definition: case_test.c:376

tfunc_lower
static size_t tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:294

initcap_wbnext
static size_t initcap_wbnext(void *state)
Definition: case_test.c:50

tfunc_title
static size_t tfunc_title(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:301

tfunc_upper
static size_t tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:317

BUFSZ
#define BUFSZ
Definition: case_test.c:29

tfunc_fold
static size_t tfunc_fold(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:324

TestFunc
size_t(* TestFunc)(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:35

str
const char * str
Definition: hashfn_unstable.h:254

free
#define free(a)
Definition: header.h:65

malloc
#define malloc(a)
Definition: header.h:50

utf8_to_unicode
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53

pg_wchar
unsigned int pg_wchar
Definition: mbprint.c:31

lower
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49

upper
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80

unicode_to_utf8
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575

unicode_utf8len
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607

printf
#define printf(...)
Definition: port.h:245

postgres_fe.h

string.h

WordBoundaryState
Definition: pg_locale_builtin.c:39

WordBoundaryState::prev_alnum
bool prev_alnum
Definition: pg_locale_builtin.c:45

WordBoundaryState::offset
size_t offset
Definition: pg_locale_builtin.c:42

WordBoundaryState::init
bool init
Definition: pg_locale_builtin.c:44

WordBoundaryState::len
size_t len
Definition: pg_locale_builtin.c:41

WordBoundaryState::str
const char * str
Definition: pg_locale_builtin.c:40

WordBoundaryState::posix
bool posix
Definition: pg_locale_builtin.c:43

state
Definition: regguts.h:323

unicode_uppercase_simple
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:66

unicode_titlecase_simple
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:58

unicode_strupper
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:165

casemap
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)
Definition: unicode_case.c:397

unicode_strlower
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:101

unicode_strtitle
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:138

unicode_lowercase_simple
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:50

unicode_strfold
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:189

unicode_casefold_simple
pg_wchar unicode_casefold_simple(pg_wchar code)
Definition: unicode_case.c:74

unicode_case.h

pg_u_isalnum
bool pg_u_isalnum(pg_wchar code, bool posix)
Definition: unicode_category.c:226

unicode_category
pg_unicode_category unicode_category(pg_wchar code)
Definition: unicode_category.c:85

unicode_category.h

pg_unicode_category
pg_unicode_category
Definition: unicode_category.h:31

PG_U_UNASSIGNED
@ PG_U_UNASSIGNED
Definition: unicode_category.h:32

unicode_version.h

PG_UNICODE_VERSION
#define PG_UNICODE_VERSION
Definition: unicode_version.h:14