Treat Unicode codepoints of category "Format" as non-spacing

Commit d8594d123 updated the list of non-spacing codepoints used for calculating display width, but in doing so inadvertently removed some, since the script used for that commit only considered combining characters. For complete coverage of zero-width characters, include codepoints in the category Cf (Format). To reflect the wider purpose, also rename files and update comments that referred specifically to combining characters. Some of these ranges have been missing since v12, but due to lack of field complaints it was determined not important enough to justify adding special-case logic to the backbranches. Kyotaro Horiguchi Report by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRBE8yvpQ0FSkPCoe0Ny1jAAsAQ6j3qMgVwWvkqAoaaNmQ%40mail.gmail.com
author: John Naylor 2022-09-13 09:13:33 +0000
committer: John Naylor 2022-09-13 09:13:33 +0000
commit: 0bd9c629732375e21d3ca6fba16c4a6a2808411a (patch)
tree: 1cdbc6743a69559f62755232d550a7fc9534731b /src/common/unicode
parent: bb629c294bea533884a379eee5f8ed6307c17bf2 (diff)
2 files changed, 9 insertions, 7 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index 60e01e748f8..382da476cf9 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
 	mv $^ $(top_srcdir)/src/include/common/
 	$(MAKE) normalization-check
 
@@ -35,7 +35,7 @@ unicode_norm_hashfunc.h: unicode_norm_table.h
 unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
 	$(PERL) $<
 
-unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
+unicode_nonspacing_table.h: generate-unicode_nonspacing_table.pl UnicodeData.txt
 	$(PERL) $^ >$@
 
 unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_nonspacing_table.pl
index 8177c20260b..3161eed4a26 100644
--- a/src/common/unicode/generate-unicode_combining_table.pl
+++ b/src/common/unicode/generate-unicode_nonspacing_table.pl
@@ -15,9 +15,9 @@ my $prev_codepoint;
 my $count = 0;
 
 print
-  "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
+  "/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */\n\n";
 
-print "static const struct mbinterval combining[] = {\n";
+print "static const struct mbinterval nonspacing[] = {\n";
 
 foreach my $line (<ARGV>)
 {
@@ -25,9 +25,11 @@ foreach my $line (<ARGV>)
 	my @fields = split ';', $line;
 	$codepoint = hex $fields[0];
 
-	if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
+	# Me and Mn refer to combining characters
+	# Cf refers to format characters
+	if ($fields[2] eq 'Me' || $fields[2] eq 'Mn' || $fields[2] eq 'Cf')
 	{
-		# combining character, save for start of range
+		# non-spacing character, save for start of range
 		if (!defined($range_start))
 		{
 			$range_start = $codepoint;
@@ -35,7 +37,7 @@ foreach my $line (<ARGV>)
 	}
 	else
 	{
-		# not a combining character, print out previous range if any
+		# not a non-spacing character, print out previous range if any
 		if (defined($range_start))
 		{
 			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
author	John Naylor	2022-09-13 09:13:33 +0000
committer	John Naylor	2022-09-13 09:13:33 +0000
commit	0bd9c629732375e21d3ca6fba16c4a6a2808411a (patch)
tree	1cdbc6743a69559f62755232d550a7fc9534731b /src/common/unicode
parent	bb629c294bea533884a379eee5f8ed6307c17bf2 (diff)