# Downloaded files
/CompositionExclusions.txt
/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
/NormalizationTest.txt
/UnicodeData.txt
# By default, do nothing.
all:
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ ../../../src/include/common/
$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+ $(PERL) $^ >$@
+
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
$(PERL) $^ >$@
rm -f $(OBJS) norm_test norm_test.o
+# distclean must remove every Unicode data file that is downloaded on
+# demand, including DerivedNormalizationProps.txt.
distclean: clean
- rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+	rm -f UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
maintainer-clean: distclean
--- /dev/null
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument.  The output is on stdout.
+#
+# Each data line has the form "XXXX;W" or "XXXX..YYYY;W", optionally followed
+# by a "#" comment.  Whitespace around the fields and CRLF line endings are
+# tolerated.  Any other non-empty line is reported as an error instead of
+# being silently treated as "not wide".
+#
+# Copyright (c) 2019-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+# Start of the wide range currently being accumulated, or undef if none.
+my $range_start = undef;
+
+# Bounds of the current data line.  These live outside the loop so that
+# their values survive skipped (blank or comment-only) lines and remain
+# visible in the "continue" block.
+my ($first, $last);
+
+# Last code point covered by the most recent data line.
+my $prev_last;
+
+# Print one table entry covering the code points $lo through $hi.
+sub emit_range
+{
+	my ($lo, $hi) = @_;
+
+	printf "\t{0x%04X, 0x%04X},\n", $lo, $hi;
+	return;
+}
+
+print
+  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+# Read the input one line at a time rather than slurping the whole file.
+while (my $line = <ARGV>)
+{
+	# Drop the comment, then any surrounding whitespace.  This also removes
+	# the line terminator, including a stray CR from a CRLF file.
+	$line =~ s/#.*//s;
+	$line =~ s/^\s+|\s+$//g;
+	next if $line eq '';
+
+	# <first>[..<last>] ; <width>
+	$line =~ /^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s*;\s*(\w+)$/
+	  or die sprintf("%s:%d: unrecognized line: %s\n", $ARGV, $., $line);
+
+	my $width = $3;
+	$first = hex($1);
+	$last = defined($2) ? hex($2) : $first;
+
+	if ($width eq 'F' || $width eq 'W')
+	{
+		# fullwidth/wide characters
+		if (!defined($range_start))
+		{
+			# save for start of range if one hasn't been started yet
+			$range_start = $first;
+		}
+		elsif ($first != $prev_last + 1)
+		{
+			# ranges aren't contiguous; emit the last and start a new one
+			emit_range($range_start, $prev_last);
+			$range_start = $first;
+		}
+	}
+	elsif (defined($range_start))
+	{
+		# not wide characters, print out previous range if any
+		emit_range($range_start, $prev_last);
+		$range_start = undef;
+	}
+}
+continue
+{
+	# Runs after every iteration, including lines skipped with "next"; on
+	# those, $last still holds the value from the previous data line.
+	$prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+emit_range($range_start, $prev_last) if defined($range_start);
+
+print "};\n";
struct mbinterval
{
- unsigned short first;
- unsigned short last;
+ unsigned int first;
+ unsigned int last;
};
/* auxiliary function for binary search in interval table */
* category code Mn or Me in the Unicode database) have a
* column width of 0.
*
- * - Other format characters (general category code Cf in the Unicode
- * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
- *
- * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
- * have a column width of 0.
- *
* - Spacing characters in the East Asian Wide (W) or East Asian
* FullWidth (F) category as defined in Unicode Technical
* Report #11 have a column width of 2.
ucs_wcwidth(pg_wchar ucs)
{
#include "common/unicode_combining_table.h"
+#include "common/unicode_east_asian_fw_table.h"
/* test for 8-bit control characters */
if (ucs == 0)
if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
return -1;
- /* binary search in table of non-spacing characters */
+ /*
+ * binary search in table of non-spacing characters
+ *
+ * XXX: In the official Unicode sources, it is possible for a character to
+ * be described as both non-spacing and wide at the same time. As of
+ * Unicode 13.0, treating the non-spacing property as the determining
+ * factor for display width leads to the correct behavior, so do that
+ * search first.
+ */
if (mbbisearch(ucs, combining,
sizeof(combining) / sizeof(struct mbinterval) - 1))
return 0;
- /*
- * if we arrive here, ucs is not a combining or C0/C1 control character
- */
+ /* binary search in table of wide characters */
+ if (mbbisearch(ucs, east_asian_fw,
+ sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
+ return 2;
- return 1 +
- (ucs >= 0x1100 &&
- (ucs <= 0x115f || /* Hangul Jamo init. consonants */
- (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
- ucs != 0x303f) || /* CJK ... Yi */
- (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
- (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
- * Ideographs */
- (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
- (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
- (ucs >= 0xffe0 && ucs <= 0xffe6) ||
- (ucs >= 0x20000 && ucs <= 0x2ffff)));
+ return 1;
}
/*
--- /dev/null
+/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */
+
+static const struct mbinterval east_asian_fw[] = {
+ {0x1100, 0x115F},
+ {0x231A, 0x231B},
+ {0x2329, 0x232A},
+ {0x23E9, 0x23EC},
+ {0x23F0, 0x23F0},
+ {0x23F3, 0x23F3},
+ {0x25FD, 0x25FE},
+ {0x2614, 0x2615},
+ {0x2648, 0x2653},
+ {0x267F, 0x267F},
+ {0x2693, 0x2693},
+ {0x26A1, 0x26A1},
+ {0x26AA, 0x26AB},
+ {0x26BD, 0x26BE},
+ {0x26C4, 0x26C5},
+ {0x26CE, 0x26CE},
+ {0x26D4, 0x26D4},
+ {0x26EA, 0x26EA},
+ {0x26F2, 0x26F3},
+ {0x26F5, 0x26F5},
+ {0x26FA, 0x26FA},
+ {0x26FD, 0x26FD},
+ {0x2705, 0x2705},
+ {0x270A, 0x270B},
+ {0x2728, 0x2728},
+ {0x274C, 0x274C},
+ {0x274E, 0x274E},
+ {0x2753, 0x2755},
+ {0x2757, 0x2757},
+ {0x2795, 0x2797},
+ {0x27B0, 0x27B0},
+ {0x27BF, 0x27BF},
+ {0x2B1B, 0x2B1C},
+ {0x2B50, 0x2B50},
+ {0x2B55, 0x2B55},
+ {0x2E80, 0x2E99},
+ {0x2E9B, 0x2EF3},
+ {0x2F00, 0x2FD5},
+ {0x2FF0, 0x2FFB},
+ {0x3000, 0x303E},
+ {0x3041, 0x3096},
+ {0x3099, 0x30FF},
+ {0x3105, 0x312F},
+ {0x3131, 0x318E},
+ {0x3190, 0x31E3},
+ {0x31F0, 0x321E},
+ {0x3220, 0x3247},
+ {0x3250, 0x4DBF},
+ {0x4E00, 0xA48C},
+ {0xA490, 0xA4C6},
+ {0xA960, 0xA97C},
+ {0xAC00, 0xD7A3},
+ {0xF900, 0xFAFF},
+ {0xFE10, 0xFE19},
+ {0xFE30, 0xFE52},
+ {0xFE54, 0xFE66},
+ {0xFE68, 0xFE6B},
+ {0xFF01, 0xFF60},
+ {0xFFE0, 0xFFE6},
+ {0x16FE0, 0x16FE4},
+ {0x16FF0, 0x16FF1},
+ {0x17000, 0x187F7},
+ {0x18800, 0x18CD5},
+ {0x18D00, 0x18D08},
+ {0x1B000, 0x1B11E},
+ {0x1B150, 0x1B152},
+ {0x1B164, 0x1B167},
+ {0x1B170, 0x1B2FB},
+ {0x1F004, 0x1F004},
+ {0x1F0CF, 0x1F0CF},
+ {0x1F18E, 0x1F18E},
+ {0x1F191, 0x1F19A},
+ {0x1F200, 0x1F202},
+ {0x1F210, 0x1F23B},
+ {0x1F240, 0x1F248},
+ {0x1F250, 0x1F251},
+ {0x1F260, 0x1F265},
+ {0x1F300, 0x1F320},
+ {0x1F32D, 0x1F335},
+ {0x1F337, 0x1F37C},
+ {0x1F37E, 0x1F393},
+ {0x1F3A0, 0x1F3CA},
+ {0x1F3CF, 0x1F3D3},
+ {0x1F3E0, 0x1F3F0},
+ {0x1F3F4, 0x1F3F4},
+ {0x1F3F8, 0x1F43E},
+ {0x1F440, 0x1F440},
+ {0x1F442, 0x1F4FC},
+ {0x1F4FF, 0x1F53D},
+ {0x1F54B, 0x1F54E},
+ {0x1F550, 0x1F567},
+ {0x1F57A, 0x1F57A},
+ {0x1F595, 0x1F596},
+ {0x1F5A4, 0x1F5A4},
+ {0x1F5FB, 0x1F64F},
+ {0x1F680, 0x1F6C5},
+ {0x1F6CC, 0x1F6CC},
+ {0x1F6D0, 0x1F6D2},
+ {0x1F6D5, 0x1F6D7},
+ {0x1F6EB, 0x1F6EC},
+ {0x1F6F4, 0x1F6FC},
+ {0x1F7E0, 0x1F7EB},
+ {0x1F90C, 0x1F93A},
+ {0x1F93C, 0x1F945},
+ {0x1F947, 0x1F978},
+ {0x1F97A, 0x1F9CB},
+ {0x1F9CD, 0x1F9FF},
+ {0x1FA70, 0x1FA74},
+ {0x1FA78, 0x1FA7A},
+ {0x1FA80, 0x1FA86},
+ {0x1FA90, 0x1FAA8},
+ {0x1FAB0, 0x1FAB6},
+ {0x1FAC0, 0x1FAC2},
+ {0x1FAD0, 0x1FAD6},
+ {0x20000, 0x2FFFD},
+ {0x30000, 0x3FFFD},
+};