Update display widths as part of updating Unicode

The hardcoded "wide character" set in ucs_wcwidth() was last updated around the Unicode 5.0 era. This led to misalignment when printing emojis and other codepoints that have since been designated wide or full-width. To fix and keep up to date, extend update-unicode to download the list of wide and full-width codepoints from the offical sources. In passing, remove some comments about non-spacing characters that haven't been accurate since we removed the former hardcoded logic. Jacob Champion Reported and reviewed by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
author: John Naylor 2021-08-26 14:53:56 +0000
committer: John Naylor 2021-08-26 14:53:56 +0000
commit: bab982161e0590746a2fd2a03043b27108b23ac6 (patch)
tree: fd77c5e80bbc1f83da0379a6427b99dc5bb26393 /src/common/unicode
parent: 1563ecbc1be8b8e5c57651cf5c87f90dea9aea8f (diff)
3 files changed, 83 insertions, 3 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
index 512862e538c..46243f701df 100644
--- a/src/common/unicode/.gitignore
+++ b/src/common/unicode/.gitignore
@@ -4,5 +4,6 @@
 # Downloaded files
 /CompositionExclusions.txt
 /DerivedNormalizationProps.txt
+/EastAsianWidth.txt
 /NormalizationTest.txt
 /UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index eb14add28ad..a3683dd86b9 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
 	mv $^ ../../../src/include/common/
 	$(MAKE) normalization-check
 
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 # Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
 unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
 	$(PERL) $^ >$@
 
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+	$(PERL) $^ >$@
+
 unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
 	$(PERL) $^ >$@
 
@@ -64,6 +67,6 @@ clean:
 	rm -f $(OBJS) norm_test norm_test.o
 
 distclean: clean
-	rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
 
 maintainer-clean: distclean
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 00000000000..45f7a4b7fe7
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument.  The output is on stdout.
+#
+# Copyright (c) 2019-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+	chomp $line;
+	$line =~ s/\s*#.*$//;
+	next if $line eq '';
+	my ($codepoint, $width) = split ';', $line;
+
+	if ($codepoint =~ /\.\./)
+	{
+		($first, $last) = split /\.\./, $codepoint;
+	}
+	else
+	{
+		$first = $last = $codepoint;
+	}
+
+	($first, $last) = map(hex, ($first, $last));
+
+	if ($width eq 'F' || $width eq 'W')
+	{
+		# fullwidth/wide characters
+		if (!defined($range_start))
+		{
+			# save for start of range if one hasn't been started yet
+			$range_start = $first;
+		}
+		elsif ($first != $prev_last + 1)
+		{
+			# ranges aren't contiguous; emit the last and start a new one
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = $first;
+		}
+	}
+	else
+	{
+		# not wide characters, print out previous range if any
+		if (defined($range_start))
+		{
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = undef;
+		}
+	}
+}
+continue
+{
+	$prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
author	John Naylor	2021-08-26 14:53:56 +0000
committer	John Naylor	2021-08-26 14:53:56 +0000
commit	bab982161e0590746a2fd2a03043b27108b23ac6 (patch)
tree	fd77c5e80bbc1f83da0379a6427b99dc5bb26393 /src/common/unicode
parent	1563ecbc1be8b8e5c57651cf5c87f90dea9aea8f (diff)