summaryrefslogtreecommitdiff
path: root/src/common/unicode
diff options
context:
space:
mode:
authorJohn Naylor2021-08-26 14:53:56 +0000
committerJohn Naylor2021-08-26 14:53:56 +0000
commitbab982161e0590746a2fd2a03043b27108b23ac6 (patch)
treefd77c5e80bbc1f83da0379a6427b99dc5bb26393 /src/common/unicode
parent1563ecbc1be8b8e5c57651cf5c87f90dea9aea8f (diff)
Update display widths as part of updating Unicode
The hardcoded "wide character" set in ucs_wcwidth() was last updated around the Unicode 5.0 era. This led to misalignment when printing emojis and other codepoints that have since been designated wide or full-width. To fix and keep up to date, extend update-unicode to download the list of wide and full-width codepoints from the offical sources. In passing, remove some comments about non-spacing characters that haven't been accurate since we removed the former hardcoded logic. Jacob Champion Reported and reviewed by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
Diffstat (limited to 'src/common/unicode')
-rw-r--r--src/common/unicode/.gitignore1
-rw-r--r--src/common/unicode/Makefile9
-rw-r--r--src/common/unicode/generate-unicode_east_asian_fw_table.pl76
3 files changed, 83 insertions, 3 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
index 512862e538c..46243f701df 100644
--- a/src/common/unicode/.gitignore
+++ b/src/common/unicode/.gitignore
@@ -4,5 +4,6 @@
# Downloaded files
/CompositionExclusions.txt
/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
/NormalizationTest.txt
/UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index eb14add28ad..a3683dd86b9 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ ../../../src/include/common/
$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+ $(PERL) $^ >$@
+
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
$(PERL) $^ >$@
@@ -64,6 +67,6 @@ clean:
rm -f $(OBJS) norm_test norm_test.o
distclean: clean
- rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+ rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
maintainer-clean: distclean
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 00000000000..45f7a4b7fe7
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument. The output is on stdout.
+#
+# Copyright (c) 2019-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $width) = split ';', $line;
+
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ ($first, $last) = map(hex, ($first, $last));
+
+ if ($width eq 'F' || $width eq 'W')
+ {
+ # fullwidth/wide characters
+ if (!defined($range_start))
+ {
+ # save for start of range if one hasn't been started yet
+ $range_start = $first;
+ }
+ elsif ($first != $prev_last + 1)
+ {
+ # ranges aren't contiguous; emit the last and start a new one
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = $first;
+ }
+ }
+ else
+ {
+ # not wide characters, print out previous range if any
+ if (defined($range_start))
+ {
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = undef;
+ }
+ }
+}
+continue
+{
+ $prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";