summaryrefslogtreecommitdiff
path: root/src/common
diff options
context:
space:
mode:
Diffstat (limited to 'src/common')
-rw-r--r-- src/common/Makefile 1
-rw-r--r-- src/common/meson.build 1
-rw-r--r-- src/common/unicode/Makefile 15
-rw-r--r-- src/common/unicode/case_test.c 100
-rw-r--r-- src/common/unicode/generate-unicode_case_table.pl 134
-rw-r--r-- src/common/unicode/meson.build 31
-rw-r--r-- src/common/unicode_case.c 174
-rw-r--r-- src/common/wchar.c 4
8 files changed, 455 insertions, 5 deletions
diff --git a/src/common/Makefile b/src/common/Makefile
index 2ba5069dca4..3d83299432b 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -78,6 +78,7 @@ OBJS_COMMON = \
scram-common.o \
string.o \
stringinfo.o \
+ unicode_case.o \
unicode_category.o \
unicode_norm.o \
username.o \
diff --git a/src/common/meson.build b/src/common/meson.build
index 4eb16024cb2..de68e408fa3 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -32,6 +32,7 @@ common_sources = files(
'scram-common.c',
'string.c',
'stringinfo.c',
+ 'unicode_case.c',
'unicode_category.c',
'unicode_norm.c',
'username.c',
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index 27f0408d8b8..c38ab36b1e4 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -21,8 +21,9 @@ CPPFLAGS += $(ICU_CFLAGS)
# By default, do nothing.
all:
-update-unicode: unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h
+update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h
mv $^ $(top_srcdir)/src/include/common/
+ $(MAKE) case-check
$(MAKE) category-check
$(MAKE) normalization-check
@@ -35,6 +36,9 @@ CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.tx
unicode_version.h: generate-unicode_version.pl
$(PERL) $< --version $(UNICODE_VERSION)
+unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
+ $(PERL) $<
+
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
$(PERL) $<
@@ -55,12 +59,17 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
$(PERL) $^ >$@
# Test suite
+case-check: case_test
+ ./case_test
+
category-check: category_test
./category_test
normalization-check: norm_test
./norm_test
+case_test: case_test.o ../unicode_case.o | submake-common
+
category_test: category_test.o ../unicode_category.o | submake-common
norm_test: norm_test.o ../unicode_norm.o | submake-common
@@ -79,7 +88,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
clean:
- rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
+ rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
distclean: clean
- rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h
+ rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
new file mode 100644
index 00000000000..7b82d5e0aad
--- /dev/null
+++ b/src/common/unicode/case_test.c
@@ -0,0 +1,100 @@
+/*-------------------------------------------------------------------------
+ * case_test.c
+ * Program to test Unicode case mapping functions.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode/case_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wctype.h>
+
+#ifdef USE_ICU
+#include <unicode/uchar.h>
+#endif
+#include "common/unicode_case.h"
+#include "common/unicode_category.h"
+#include "common/unicode_version.h"
+
+#ifdef USE_ICU
+
+/*
+ * Check the simple lower/title/upper mappings of a single code point against
+ * ICU's u_tolower(), u_totitle() and u_toupper().
+ *
+ * Returns normally when all three agree.  On any disagreement, prints both
+ * sets of results and terminates the program with exit status 1.
+ */
+static void
+icu_test_simple(pg_wchar code)
+{
+	pg_wchar	our_lower = unicode_lowercase_simple(code);
+	pg_wchar	our_title = unicode_titlecase_simple(code);
+	pg_wchar	our_upper = unicode_uppercase_simple(code);
+	pg_wchar	icu_lower = u_tolower(code);
+	pg_wchar	icu_title = u_totitle(code);
+	pg_wchar	icu_upper = u_toupper(code);
+
+	/* Nothing to report when every mapping matches. */
+	if (our_lower == icu_lower &&
+		our_title == icu_title &&
+		our_upper == icu_upper)
+		return;
+
+	printf("case_test: FAILURE for codepoint 0x%06x\n", code);
+	printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
+		   our_lower, our_title, our_upper);
+	printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
+		   icu_lower, icu_title, icu_upper);
+	printf("\n");
+	exit(1);
+}
+
+/*
+ * Walk every code point from 0 through 0x10ffff and compare the simple case
+ * mappings with ICU for each one that unicode_category() reports as assigned.
+ *
+ * Code points that ICU reports as unassigned are counted and skipped rather
+ * than compared.  A summary is printed at the end; a mismatch terminates the
+ * program from within icu_test_simple().
+ */
+static void
+test_icu(void)
+{
+	int			n_checked = 0;
+	int			n_skipped = 0;
+
+	for (pg_wchar code = 0; code <= 0x10ffff; code++)
+	{
+		uint8_t		icu_category;
+
+		/* Unassigned on our side: nothing to compare. */
+		if (unicode_category(code) == PG_U_UNASSIGNED)
+			continue;
+
+		/* Assigned for us but not for ICU: skip and remember. */
+		icu_category = u_charType(code);
+		if (icu_category == PG_U_UNASSIGNED)
+		{
+			n_skipped++;
+			continue;
+		}
+
+		icu_test_simple(code);
+		n_checked++;
+	}
+
+	if (n_skipped > 0)
+		printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
+			   n_skipped);
+
+	printf("case_test: ICU simple mapping test: %d codepoints successful\n",
+		   n_checked);
+}
+#endif
+
+/*
+ * Print the Unicode version the Postgres tables were built from and, when
+ * compiled with ICU support, exhaustively compare the simple case mappings
+ * with the results from ICU.  Without ICU the comparison is skipped.
+ */
+int
+main(int argc, char **argv)
+{
+	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
+
+#ifndef USE_ICU
+	printf("case_test: ICU not available; skipping\n");
+#else
+	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
+	test_icu();
+#endif
+
+	return 0;
+}
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
new file mode 100644
index 00000000000..44b785b8619
--- /dev/null
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -0,0 +1,134 @@
+#!/usr/bin/perl
+#
+# Generate Unicode character case mappings. Does not include tailoring
+# or locale-specific mappings.
+#
+# Input: UnicodeData.txt
+# Output: unicode_case_table.h
+#
+# Copyright (c) 2000-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+
+# Directory that holds UnicodeData.txt and receives the generated header.
+# Defaults to the current directory; override with --outdir.
+my $output_path = '.';
+
+GetOptions('outdir:s' => \$output_path);
+
+my $output_table_file = "$output_path/unicode_case_table.h";
+
+my $FH;
+
+# Maps a codepoint to its simple lower/title/upper mappings.  Only codepoints
+# with at least one non-empty mapping field get an entry.
+my %simple = ();
+
+open($FH, '<', "$output_path/UnicodeData.txt")
+  or die "Could not open $output_path/UnicodeData.txt: $!.";
+while (my $record = <$FH>)
+{
+	my @fields = split(';', $record);
+	my $code = hex($fields[0]);
+
+	# Fields 12, 13 and 14 are read as the simple uppercase, lowercase and
+	# titlecase mappings.  An empty field converts to 0, which is treated
+	# below as "no mapping".
+	my $upper = hex($fields[12] =~ s/^\s+|\s+$//rg);
+	my $lower = hex($fields[13] =~ s/^\s+|\s+$//rg);
+	my $title = hex($fields[14] =~ s/^\s+|\s+$//rg);
+
+	$code <= 0x10FFFF or die "codepoint $code out of range";
+	$lower <= 0x10FFFF or die "Simple_Lowercase $code out of range";
+	$title <= 0x10FFFF or die "Simple_Titlecase $code out of range";
+	$upper <= 0x10FFFF or die "Simple_Uppercase $code out of range";
+
+	# Codepoints without any simple mapping are not recorded at all.
+	next unless ($lower || $title || $upper);
+
+	# A missing individual mapping falls back to the codepoint itself.
+	$simple{$code} = {
+		Simple_Lowercase => ($lower || $code),
+		Simple_Titlecase => ($title || $code),
+		Simple_Uppercase => ($upper || $code)
+	};
+}
+close $FH;
+
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+  or die "Could not open output file $output_table_file: $!\n";
+
+# The array has one entry for every codepoint < 0x80 (the dense part), plus
+# one entry for each mapped codepoint >= 0x80 (the sparse part).
+my $num_sparse = grep { $_ >= 0x80 } keys %simple;
+my $num_simple = 0x80 + $num_sparse;
+
+print $OT <<"EOS";
+/*-------------------------------------------------------------------------
+ *
+ * unicode_case_table.h
+ * Case mapping and information table.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_case_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
+ * here.
+ */
+
+#include "common/unicode_case.h"
+#include "mb/pg_wchar.h"
+
+typedef enum
+{
+ CaseLower = 0,
+ CaseTitle = 1,
+ CaseUpper = 2,
+ NCaseKind
+} CaseKind;
+
+typedef struct
+{
+ pg_wchar codepoint; /* Unicode codepoint */
+ pg_wchar simplemap[NCaseKind];
+} pg_case_map;
+
+/*
+ * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
+ * sparse for higher codepoints (requiring scan or binary search).
+ */
+static const pg_case_map case_map[$num_simple] =
+{
+EOS
+
+# Dense part: one entry for every codepoint < 0x80, whether or not it has a
+# mapping, so that the C code can index the array directly by codepoint.
+print $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
+for (my $code = 0; $code < 0x80; $code++)
+{
+	# Look the entry up through a fallback hash.  Dereferencing
+	# $simple{$code}{...} directly would autovivify an empty entry in
+	# %simple for every unmapped codepoint.
+	my $map = $simple{$code} || {};
+	my $lc = ($map->{Simple_Lowercase} || $code);
+	my $tc = ($map->{Simple_Titlecase} || $code);
+	my $uc = ($map->{Simple_Uppercase} || $code);
+	printf $OT
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  $code, $lc, $tc, $uc;
+}
+print $OT "\n";
+
+# Sparse part: only codepoints >= 0x80 that have a mapping, in ascending
+# codepoint order so the table can be binary-searched.
+print $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
+foreach my $code (sort { $a <=> $b } (keys %simple))
+{
+	next unless $code >= 0x80;    # already output above
+
+	my $map = $simple{$code};
+	printf $OT
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
+	  $map->{Simple_Uppercase};
+}
+print $OT "};\n";
+
+# Output is buffered, so a write failure (for example a full disk) may only
+# be reported when the handle is closed.  Check it rather than silently
+# leaving a truncated header behind.
+close $OT
+  or die "Could not close output file $output_table_file: $!\n";
diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build
index d7190bb8ca9..b9a4181c320 100644
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@@ -25,6 +25,16 @@ endforeach
update_unicode_targets = []
update_unicode_targets += \
+ custom_target('unicode_case_table.h',
+ input: [unicode_data['UnicodeData.txt']],
+ output: ['unicode_case_table.h'],
+ command: [
+ perl, files('generate-unicode_case_table.pl'),
+ '--outdir', '@OUTDIR@', '@INPUT@'],
+ build_by_default: false,
+ )
+
+update_unicode_targets += \
custom_target('unicode_category_table.h',
input: [unicode_data['UnicodeData.txt'], unicode_data['DerivedCoreProperties.txt'], unicode_data['PropList.txt']],
output: ['unicode_category_table.h'],
@@ -92,6 +102,17 @@ norm_test_table = custom_target('norm_test_table.h',
inc = include_directories('.')
+case_test = executable('case_test',
+ ['case_test.c'],
+ dependencies: [frontend_port_code, icu],
+ include_directories: inc,
+ link_with: [common_static, pgport_static],
+ build_by_default: false,
+ kwargs: default_bin_args + {
+ 'install': false,
+ }
+)
+
category_test = executable('category_test',
['category_test.c'],
dependencies: [frontend_port_code, icu],
@@ -117,6 +138,16 @@ norm_test = executable('norm_test',
update_unicode_dep = []
if not meson.is_cross_build()
+ update_unicode_dep += custom_target('case_test.run',
+ output: 'case_test.run',
+ input: update_unicode_targets,
+ command: [case_test, UNICODE_VERSION],
+ build_by_default: false,
+ build_always_stale: true,
+ )
+endif
+
+if not meson.is_cross_build()
update_unicode_dep += custom_target('category_test.run',
output: 'category_test.run',
input: update_unicode_targets,
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
new file mode 100644
index 00000000000..842db173ba8
--- /dev/null
+++ b/src/common/unicode_case.c
@@ -0,0 +1,174 @@
+/*-------------------------------------------------------------------------
+ * unicode_case.c
+ * Unicode case mapping and case conversion.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode_case.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
+#include "common/unicode_case.h"
+#include "common/unicode_case_table.h"
+#include "common/unicode_category.h"
+#include "mb/pg_wchar.h"
+
+static const pg_case_map *find_case_map(pg_wchar ucs);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
+ CaseKind casekind);
+
+/*
+ * unicode_lowercase_simple()
+ *
+ * Return the simple lowercase mapping of code, or code itself when the case
+ * table has no entry for it.
+ */
+pg_wchar
+unicode_lowercase_simple(pg_wchar code)
+{
+	const pg_case_map *entry = find_case_map(code);
+
+	if (entry == NULL)
+		return code;
+
+	return entry->simplemap[CaseLower];
+}
+
+/*
+ * unicode_titlecase_simple()
+ *
+ * Return the simple titlecase mapping of code, or code itself when the case
+ * table has no entry for it.
+ */
+pg_wchar
+unicode_titlecase_simple(pg_wchar code)
+{
+	const pg_case_map *entry = find_case_map(code);
+
+	if (entry == NULL)
+		return code;
+
+	return entry->simplemap[CaseTitle];
+}
+
+/*
+ * unicode_uppercase_simple()
+ *
+ * Return the simple uppercase mapping of code, or code itself when the case
+ * table has no entry for it.
+ */
+pg_wchar
+unicode_uppercase_simple(pg_wchar code)
+{
+	const pg_case_map *entry = find_case_map(code);
+
+	if (entry == NULL)
+		return code;
+
+	return entry->simplemap[CaseUpper];
+}
+
+/*
+ * unicode_strlower()
+ *
+ * Convert src to lowercase, and return the result length (not including
+ * terminating NUL). The returned length is that of the complete converted
+ * string, even when it did not fit into dst.
+ *
+ * String src must be encoded in UTF-8. Conversion stops after srclen bytes
+ * or at the first NUL byte, whichever comes first. Note that srclen is a
+ * size_t: a caller passing a negative value actually passes a very large
+ * unsigned value, in which case src must be NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * This is a thin wrapper that calls convert_case() with CaseLower.
+ */
+size_t
+unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen)
+{
+ return convert_case(dst, dstsize, src, srclen, CaseLower);
+}
+
+/*
+ * unicode_strupper()
+ *
+ * Convert src to uppercase, and return the result length (not including
+ * terminating NUL). The returned length is that of the complete converted
+ * string, even when it did not fit into dst.
+ *
+ * String src must be encoded in UTF-8. Conversion stops after srclen bytes
+ * or at the first NUL byte, whichever comes first. Note that srclen is a
+ * size_t: a caller passing a negative value actually passes a very large
+ * unsigned value, in which case src must be NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * This is a thin wrapper that calls convert_case() with CaseUpper.
+ */
+size_t
+unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen)
+{
+ return convert_case(dst, dstsize, src, srclen, CaseUpper);
+}
+
+/*
+ * convert_case()
+ *
+ * Implement Unicode Default Case Conversion algorithm, using simple
+ * (single codepoint to single codepoint) mappings only.
+ *
+ * Map each character in the string for which a mapping is available;
+ * characters without a case_map entry are copied through unchanged.
+ *
+ * dst, dstsize: output buffer and its size in bytes.  dst may be NULL when
+ *		dstsize is zero.
+ * src, srclen: UTF-8 input.  At most srclen bytes are examined, and
+ *		conversion also stops at the first NUL byte.
+ * casekind: which mapping of each case_map entry to apply.
+ *
+ * Returns the length in bytes of the complete converted string, not
+ * including any terminating NUL, even if that is more than was stored.
+ * Characters are stored only while they fit entirely within dstsize, and a
+ * terminating NUL is added only if dstsize is greater than the result length.
+ */
+static size_t
+convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
+			 CaseKind casekind)
+{
+	size_t		srcoff = 0;
+	size_t		result_len = 0;
+
+	/*
+	 * Check the length bound before looking at the byte, so that we never
+	 * read src[srclen] of an input that is not NUL-terminated.
+	 *
+	 * srclen is unsigned, so a caller asking for "NUL-terminated" by passing
+	 * a negative value arrives here as a huge length; the bound is then
+	 * never reached and the NUL test ends the loop.
+	 */
+	while (srcoff < srclen && src[srcoff] != '\0')
+	{
+		pg_wchar	u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
+		int			u1len = unicode_utf8len(u1);
+		const pg_case_map *casemap = find_case_map(u1);
+
+		if (casemap)
+		{
+			pg_wchar	u2 = casemap->simplemap[casekind];
+			int			u2len = unicode_utf8len(u2);
+
+			/*
+			 * "<=", not "<": a character ending exactly at dstsize still
+			 * fits.  No room is reserved for the terminating NUL, which is
+			 * only added below when there is space left over.
+			 */
+			if (result_len + u2len <= dstsize)
+				unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+			result_len += u2len;
+		}
+		else
+		{
+			/* no mapping; copy bytes from src */
+			if (result_len + u1len <= dstsize)
+				memcpy(dst + result_len, src + srcoff, u1len);
+
+			result_len += u1len;
+		}
+
+		srcoff += u1len;
+	}
+
+	/* NUL-terminate only if there is room beyond the converted bytes. */
+	if (result_len < dstsize)
+		dst[result_len] = '\0';
+
+	return result_len;
+}
+
+/*
+ * find_case_map()
+ *
+ * Find the entry for ucs in the simple case map.  Returns a pointer to the
+ * entry, or NULL if ucs is 0x80 or above and has no entry.
+ */
+static const pg_case_map *
+find_case_map(pg_wchar ucs)
+{
+	int			lo;
+	int			hi;
+
+	/* all chars < 0x80 are stored in the array, indexed by codepoint */
+	Assert(lengthof(case_map) >= 0x80);
+	if (ucs < 0x80)
+	{
+		const pg_case_map *direct = &case_map[ucs];
+
+		Assert(direct->codepoint == ucs);
+		return direct;
+	}
+
+	/*
+	 * Otherwise, binary search the remaining entries.  The search range is
+	 * the half-open interval [lo, hi).
+	 */
+	lo = 0x80;
+	hi = lengthof(case_map);
+	while (lo < hi)
+	{
+		int			mid = lo + (hi - lo) / 2;
+		pg_wchar	probe = case_map[mid].codepoint;
+
+		if (probe == ucs)
+			return &case_map[mid];
+
+		if (probe < ucs)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+
+	return NULL;
+}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 7e7a7507d5c..a238c0106c6 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -477,8 +477,8 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
/*
- * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
- * space allocated.
+ * Map a Unicode code point to UTF-8. utf8string must have at least
+ * unicode_utf8len(c) bytes available.
*/
unsigned char *
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)