diff options
Diffstat (limited to 'src/common')
-rw-r--r-- | src/common/Makefile | 1 | ||||
-rw-r--r-- | src/common/meson.build | 1 | ||||
-rw-r--r-- | src/common/unicode/Makefile | 15 | ||||
-rw-r--r-- | src/common/unicode/case_test.c | 100 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_case_table.pl | 134 | ||||
-rw-r--r-- | src/common/unicode/meson.build | 31 | ||||
-rw-r--r-- | src/common/unicode_case.c | 174 | ||||
-rw-r--r-- | src/common/wchar.c | 4 |
8 files changed, 455 insertions, 5 deletions
diff --git a/src/common/Makefile b/src/common/Makefile index 2ba5069dca4..3d83299432b 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -78,6 +78,7 @@ OBJS_COMMON = \ scram-common.o \ string.o \ stringinfo.o \ + unicode_case.o \ unicode_category.o \ unicode_norm.o \ username.o \ diff --git a/src/common/meson.build b/src/common/meson.build index 4eb16024cb2..de68e408fa3 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -32,6 +32,7 @@ common_sources = files( 'scram-common.c', 'string.c', 'stringinfo.c', + 'unicode_case.c', 'unicode_category.c', 'unicode_norm.c', 'username.c', diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index 27f0408d8b8..c38ab36b1e4 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -21,8 +21,9 @@ CPPFLAGS += $(ICU_CFLAGS) # By default, do nothing. all: -update-unicode: unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h +update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h mv $^ $(top_srcdir)/src/include/common/ + $(MAKE) case-check $(MAKE) category-check $(MAKE) normalization-check @@ -35,6 +36,9 @@ CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.tx unicode_version.h: generate-unicode_version.pl $(PERL) $< --version $(UNICODE_VERSION) +unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt + $(PERL) $< + unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt $(PERL) $< @@ -55,12 +59,17 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat $(PERL) $^ >$@ # Test suite +case-check: case_test + ./case_test + category-check: category_test ./category_test 
normalization-check: norm_test ./norm_test +case_test: case_test.o ../unicode_case.o | submake-common + category_test: category_test.o ../unicode_category.o | submake-common norm_test: norm_test.o ../unicode_norm.o | submake-common @@ -79,7 +88,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt clean: - rm -f $(OBJS) category_test category_test.o norm_test norm_test.o + rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o distclean: clean - rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h + rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c new file mode 100644 index 00000000000..7b82d5e0aad --- /dev/null +++ b/src/common/unicode/case_test.c @@ -0,0 +1,100 @@ +/*------------------------------------------------------------------------- + * case_test.c + * Program to test Unicode case mapping functions. 
+ * + * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode/case_test.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <wctype.h> + +#ifdef USE_ICU +#include <unicode/uchar.h> +#endif +#include "common/unicode_case.h" +#include "common/unicode_category.h" +#include "common/unicode_version.h" + +#ifdef USE_ICU + +/* + * Compare the Postgres simple lower/title/upper mappings of one codepoint + * with ICU's u_tolower()/u_totitle()/u_toupper(). On any mismatch, print + * both sets of results and exit with status 1. + */ +static void +icu_test_simple(pg_wchar code) +{ + pg_wchar lower = unicode_lowercase_simple(code); + pg_wchar title = unicode_titlecase_simple(code); + pg_wchar upper = unicode_uppercase_simple(code); + pg_wchar iculower = u_tolower(code); + pg_wchar icutitle = u_totitle(code); + pg_wchar icuupper = u_toupper(code); + + if (lower != iculower || title != icutitle || upper != icuupper) + { + printf("case_test: FAILURE for codepoint 0x%06x\n", code); + printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n", + lower, title, upper); + printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n", + iculower, icutitle, icuupper); + printf("\n"); + exit(1); + } +} + +/* + * Check every codepoint from 0 to 0x10ffff against ICU. Codepoints that + * Postgres reports as unassigned are ignored; codepoints assigned in + * Postgres but unassigned in ICU are counted as skipped; all others are + * verified with icu_test_simple(). Prints a summary when done. + */ +static void +test_icu(void) +{ + int successful = 0; + int skipped_mismatch = 0; + + for (pg_wchar code = 0; code <= 0x10ffff; code++) + { + pg_unicode_category category = unicode_category(code); + + if (category != PG_U_UNASSIGNED) + { + uint8_t icu_category = u_charType(code); + + /* + * NOTE(review): this compares ICU's u_charType() result with the + * Postgres enum value; it assumes both use the same number for + * "unassigned" -- confirm. + */ + if (icu_category == PG_U_UNASSIGNED) + { + skipped_mismatch++; + continue; + } + + icu_test_simple(code); + successful++; + } + } + + if (skipped_mismatch > 0) + printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n", + skipped_mismatch); + + printf("case_test: ICU simple mapping test: %d codepoints successful\n", + successful); +} +#endif + +/* + * Exhaustively compare simple case mappings with the results from ICU, when + * built with ICU; otherwise only report that the test was skipped. 
+ */ +int +main(int argc, char **argv) +{ + printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION); +#ifdef USE_ICU + printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION); + test_icu(); +#else + printf("case_test: ICU not available; skipping\n"); +#endif + + exit(0); +} diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl new file mode 100644 index 00000000000..44b785b8619 --- /dev/null +++ b/src/common/unicode/generate-unicode_case_table.pl @@ -0,0 +1,134 @@ +#!/usr/bin/perl +# +# Generate Unicode character case mappings. Does not include tailoring +# or locale-specific mappings. +# +# Input: UnicodeData.txt +# Output: unicode_case_table.h +# +# Copyright (c) 2000-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use Getopt::Long; + +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; + +my $output_path = '.'; + +GetOptions('outdir:s' => \$output_path); + +my $output_table_file = "$output_path/unicode_case_table.h"; + +my $FH; + +# codepoint => { Simple_Lowercase, Simple_Titlecase, Simple_Uppercase }, for +# every codepoint that has at least one simple case mapping +my %simple = (); + +# UnicodeData.txt is read from the same directory the output is written to +open($FH, '<', "$output_path/UnicodeData.txt") + or die "Could not open $output_path/UnicodeData.txt: $!."; +while (my $line = <$FH>) +{ + # fields 12, 13 and 14 are read as the simple uppercase, lowercase and + # titlecase mappings, with surrounding whitespace stripped + my @elts = split(';', $line); + my $code = hex($elts[0]); + my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg); + my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg); + my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg); + + die "codepoint $code out of range" if $code > 0x10FFFF; + die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF; + die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF; + die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF; + + # record only codepoints with at least one mapping; a zero (absent) + # mapping falls back to the codepoint itself + if ($simple_lowercase || $simple_titlecase || $simple_uppercase) + { + $simple{$code} = { + Simple_Lowercase => ($simple_lowercase || $code), + Simple_Titlecase => ($simple_titlecase || $code), + Simple_Uppercase => ($simple_uppercase || 
$code) + }; + } +} +close $FH; + +# Start writing out the output files +open my $OT, '>', $output_table_file + or die "Could not open output file $output_table_file: $!\n"; + +# determine size of array given that codepoints < 0x80 are dense and +# the rest of the entries are sparse +my $num_simple = 0x80; +foreach my $code (sort { $a <=> $b } (keys %simple)) +{ + $num_simple++ unless $code < 0x80; +} + +print $OT <<"EOS"; +/*------------------------------------------------------------------------- + * + * unicode_case_table.h + * Case mapping and information table. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_case_table.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_case_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H + * here. + */ + +#include "common/unicode_case.h" +#include "mb/pg_wchar.h" + +typedef enum +{ + CaseLower = 0, + CaseTitle = 1, + CaseUpper = 2, + NCaseKind +} CaseKind; + +typedef struct +{ + pg_wchar codepoint; /* Unicode codepoint */ + pg_wchar simplemap[NCaseKind]; +} pg_case_map; + +/* + * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup), + * sparse for higher codepoints (requiring scan or binary search). 
+ */ +static const pg_case_map case_map[$num_simple] = +{ +EOS + +# Dense part: one entry for every codepoint below 0x80; a codepoint without +# an entry in %simple maps to itself. +printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n"; +for (my $code = 0; $code < 0x80; $code++) +{ + my $lc = ($simple{$code}{Simple_Lowercase} || $code); + my $tc = ($simple{$code}{Simple_Titlecase} || $code); + my $uc = ($simple{$code}{Simple_Uppercase} || $code); + printf $OT + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", + $code, $lc, $tc, $uc; +} +printf $OT "\n"; + +# Sparse part: the remaining codepoints that have a mapping, written in +# ascending codepoint order. +printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n"; +foreach my $code (sort { $a <=> $b } (keys %simple)) +{ + next unless $code >= 0x80; # already output above + + my $map = $simple{$code}; + printf $OT + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", + $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase}, + $map->{Simple_Uppercase}; +} +print $OT "};\n"; diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build index d7190bb8ca9..b9a4181c320 100644 --- a/src/common/unicode/meson.build +++ b/src/common/unicode/meson.build @@ -25,6 +25,16 @@ endforeach update_unicode_targets = [] update_unicode_targets += \ + custom_target('unicode_case_table.h', + input: [unicode_data['UnicodeData.txt']], + output: ['unicode_case_table.h'], + command: [ + perl, files('generate-unicode_case_table.pl'), + '--outdir', '@OUTDIR@', '@INPUT@'], + build_by_default: false, + ) + +update_unicode_targets += \ custom_target('unicode_category_table.h', input: [unicode_data['UnicodeData.txt'], unicode_data['DerivedCoreProperties.txt'], unicode_data['PropList.txt']], output: ['unicode_category_table.h'], @@ -92,6 +102,17 @@ norm_test_table = custom_target('norm_test_table.h', inc = include_directories('.') +case_test = executable('case_test', + ['case_test.c'], + dependencies: [frontend_port_code, icu], + include_directories: inc, + link_with: [common_static, pgport_static], + build_by_default: false, + kwargs: default_bin_args + { + 
'install': false, + } +) + category_test = executable('category_test', ['category_test.c'], dependencies: [frontend_port_code, icu], @@ -117,6 +138,16 @@ norm_test = executable('norm_test', update_unicode_dep = [] if not meson.is_cross_build() + update_unicode_dep += custom_target('case_test.run', + output: 'case_test.run', + input: update_unicode_targets, + command: [case_test, UNICODE_VERSION], + build_by_default: false, + build_always_stale: true, + ) +endif + +if not meson.is_cross_build() update_unicode_dep += custom_target('category_test.run', output: 'category_test.run', input: update_unicode_targets, diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c new file mode 100644 index 00000000000..842db173ba8 --- /dev/null +++ b/src/common/unicode_case.c @@ -0,0 +1,174 @@ +/*------------------------------------------------------------------------- + * unicode_case.c + * Unicode case mapping and case conversion. + * + * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode_case.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/unicode_case.h" +#include "common/unicode_case_table.h" +#include "common/unicode_category.h" +#include "mb/pg_wchar.h" + +static const pg_case_map *find_case_map(pg_wchar ucs); +static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, + CaseKind casekind); + +/* + * Return the simple lowercase mapping of code, or code itself if the case + * table has no entry for it. + */ +pg_wchar +unicode_lowercase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseLower] : code; +} + +/* + * Return the simple titlecase mapping of code, or code itself if the case + * table has no entry for it. + */ +pg_wchar +unicode_titlecase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseTitle] : code; +} + +/* + * Return the simple uppercase mapping of code, or code itself if the case + * table has no entry for it. + */ +pg_wchar +unicode_uppercase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? 
map->simplemap[CaseUpper] : code; +} + +/* + * unicode_strlower() + * + * Convert src to lowercase, and return the result length (not including + * terminating NUL). + * + * String src must be encoded in UTF-8. Conversion stops at the first NUL + * byte or after srclen bytes, whichever comes first. srclen is unsigned, so + * a caller passing -1 gets the largest possible bound, and src must then be + * NUL-terminated. + * + * Result string is stored in dst, truncating (at a character boundary) if + * larger than dstsize. If dstsize is greater than the result length, dst + * will be NUL-terminated; otherwise not. + * + * If dstsize is zero, dst may be NULL. This is useful for calculating the + * required buffer size before allocating. + */ +size_t +unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen) +{ + return convert_case(dst, dstsize, src, srclen, CaseLower); +} + +/* + * unicode_strupper() + * + * Convert src to uppercase, and return the result length (not including + * terminating NUL). + * + * String src must be encoded in UTF-8. Conversion stops at the first NUL + * byte or after srclen bytes, whichever comes first. srclen is unsigned, so + * a caller passing -1 gets the largest possible bound, and src must then be + * NUL-terminated. + * + * Result string is stored in dst, truncating (at a character boundary) if + * larger than dstsize. If dstsize is greater than the result length, dst + * will be NUL-terminated; otherwise not. + * + * If dstsize is zero, dst may be NULL. This is useful for calculating the + * required buffer size before allocating. + */ +size_t +unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen) +{ + return convert_case(dst, dstsize, src, srclen, CaseUpper); +} + +/* + * Implement Unicode Default Case Conversion algorithm. + * + * Map each character in the string for which a mapping is available, using + * only the simple (one-to-one) mappings in case_map. + * + * Returns the length in bytes of the complete converted string, which may + * exceed dstsize; only whole characters that fit within dstsize are stored. 
+ */ +static size_t +convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, + CaseKind casekind) +{ + size_t srcoff = 0; + size_t result_len = 0; + + /* + * Check the length bound before reading src[srcoff], so that a buffer of + * exactly srclen bytes with no terminating NUL is never read past its + * end. srclen is unsigned: a "negative" length arrives as a huge value, + * in which case the loop stops at the NUL terminator. + */ + while (srcoff < srclen && src[srcoff] != '\0') + { + pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff); + int u1len = unicode_utf8len(u1); + const pg_case_map *casemap = find_case_map(u1); + + if (casemap) + { + pg_wchar u2 = casemap->simplemap[casekind]; + int u2len = unicode_utf8len(u2); + + /* store the character if it fits; room for a NUL is not required */ + if (result_len + u2len <= dstsize) + unicode_to_utf8(u2, (unsigned char *) dst + result_len); + + result_len += u2len; + } + else + { + /* no mapping; copy bytes from src */ + if (result_len + u1len <= dstsize) + memcpy(dst + result_len, src + srcoff, u1len); + + result_len += u1len; + } + + srcoff += u1len; + } + + /* terminate only when there is room left after the whole result */ + if (result_len < dstsize) + dst[result_len] = '\0'; + + return result_len; +} + +/* find entry in simple case map, if any; returns NULL when there is none */ +static const pg_case_map * +find_case_map(pg_wchar ucs) +{ + int min; + int mid; + int max; + + /* all chars < 0x80 are stored in array for fast lookup */ + Assert(lengthof(case_map) >= 0x80); + if (ucs < 0x80) + { + const pg_case_map *map = &case_map[ucs]; + + Assert(map->codepoint == ucs); + return map; + } + + /* otherwise, binary search the sparse entries that follow the dense ones */ + min = 0x80; + max = lengthof(case_map) - 1; + while (max >= min) + { + mid = (min + max) / 2; + if (ucs > case_map[mid].codepoint) + min = mid + 1; + else if (ucs < case_map[mid].codepoint) + max = mid - 1; + else + return &case_map[mid]; + } + + return NULL; +} diff --git a/src/common/wchar.c b/src/common/wchar.c index 7e7a7507d5c..a238c0106c6 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -477,8 +477,8 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) /* - * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of - * space allocated. + * Map a Unicode code point to UTF-8. utf8string must have at least + * unicode_utf8len(c) bytes available. 
*/ unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string) |