diff options
| author | John Naylor | 2021-08-26 14:53:56 +0000 |
|---|---|---|
| committer | John Naylor | 2021-08-26 14:53:56 +0000 |
| commit | bab982161e0590746a2fd2a03043b27108b23ac6 (patch) | |
| tree | fd77c5e80bbc1f83da0379a6427b99dc5bb26393 /src/common/unicode | |
| parent | 1563ecbc1be8b8e5c57651cf5c87f90dea9aea8f (diff) | |
Update display widths as part of updating Unicode
The hardcoded "wide character" set in ucs_wcwidth() was last updated
around the Unicode 5.0 era. This led to misalignment when printing
emojis and other codepoints that have since been designated
wide or full-width.
To fix and keep up to date, extend update-unicode to download the list
of wide and full-width codepoints from the offical sources.
In passing, remove some comments about non-spacing characters that
haven't been accurate since we removed the former hardcoded logic.
Jacob Champion
Reported and reviewed by Pavel Stehule
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
Diffstat (limited to 'src/common/unicode')
| -rw-r--r-- | src/common/unicode/.gitignore | 1 | ||||
| -rw-r--r-- | src/common/unicode/Makefile | 9 | ||||
| -rw-r--r-- | src/common/unicode/generate-unicode_east_asian_fw_table.pl | 76 |
3 files changed, 83 insertions, 3 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore index 512862e538c..46243f701df 100644 --- a/src/common/unicode/.gitignore +++ b/src/common/unicode/.gitignore @@ -4,5 +4,6 @@ # Downloaded files /CompositionExclusions.txt /DerivedNormalizationProps.txt +/EastAsianWidth.txt /NormalizationTest.txt /UnicodeData.txt diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index eb14add28ad..a3683dd86b9 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS) # By default, do nothing. all: -update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h +update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h mv $^ ../../../src/include/common/ $(MAKE) normalization-check # These files are part of the Unicode Character Database. Download # them on demand. The dependency on Makefile.global is for # UNICODE_VERSION. -UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global +UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) # Generation of conversion tables used for string normalization with @@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt $(PERL) $^ >$@ +unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt + $(PERL) $^ >$@ + unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt $(PERL) $^ >$@ @@ -64,6 +67,6 @@ clean: rm -f $(OBJS) norm_test norm_test.o distclean: clean - rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h + rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h maintainer-clean: distclean diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl new file mode 100644 index 00000000000..45f7a4b7fe7 --- /dev/null +++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl @@ -0,0 +1,76 @@ +#!/usr/bin/perl +# +# Generate a sorted list of non-overlapping intervals of East Asian Wide (W) +# and East Asian Fullwidth (F) characters, using Unicode data files as input. +# Pass EastAsianWidth.txt as argument. The output is on stdout. +# +# Copyright (c) 2019-2021, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $range_start = undef; +my ($first, $last); +my $prev_last; + +print + "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n"; + +print "static const struct mbinterval east_asian_fw[] = {\n"; + +foreach my $line (<ARGV>) +{ + chomp $line; + $line =~ s/\s*#.*$//; + next if $line eq ''; + my ($codepoint, $width) = split ';', $line; + + if ($codepoint =~ /\.\./) + { + ($first, $last) = split /\.\./, $codepoint; + } + else + { + $first = $last = $codepoint; + } + + ($first, $last) = map(hex, ($first, $last)); + + if ($width eq 'F' || $width eq 'W') + { + # fullwidth/wide characters + if (!defined($range_start)) + { + # save for start of range if one hasn't been started yet + $range_start = $first; + } + elsif ($first != $prev_last + 1) + { + # ranges aren't contiguous; emit the last and start a new one + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; + $range_start = $first; + } + } + else + { + # not wide characters, print out previous range if any + if (defined($range_start)) + { + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; + $range_start = undef; + } + } +} +continue +{ + $prev_last = $last; +} + +# don't forget any ranges at the very end of the database (though there are none +# as of Unicode 13.0) +if (defined($range_start)) +{ + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; +} + +print "};\n"; |
