Extend collection of Unicode combining characters to beyond the BMP
author John Naylor <john.naylor@postgresql.org>
Thu, 26 Aug 2021 17:07:34 +0000 (13:07 -0400)
committer John Naylor <john.naylor@postgresql.org>
Thu, 26 Aug 2021 17:07:34 +0000 (13:07 -0400)
The former limit was perhaps a carryover from an older hand-coded
table. Since commit bab982161 we have enough space in mbinterval to
store larger codepoints, so collect all combining characters.

Discussion: https://www.postgresql.org/message-id/49ad1fa0-174e-c901-b14c-c484b60907f1%40enterprisedb.com

src/common/unicode/generate-unicode_combining_table.pl
src/include/common/unicode_combining_table.h

index 86aed7890708b5088ef389d1c1ba5d3b245b12a6..093a802f5f965a66cd553c273bb7a6c1d2f00fc4 100644 (file)
@@ -25,8 +25,6 @@ foreach my $line (<ARGV>)
    my @fields = split ';', $line;
    $codepoint = hex $fields[0];
 
-   next if $codepoint > 0xFFFF;
-
    if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
    {
        # combining character, save for start of range
index a9f10c31bc8a44b73b06b23aa5c96ef973e50101..4dcaf45c3c21d7a26d090fa5b012af4ff48cb405 100644 (file)
@@ -193,4 +193,106 @@ static const struct mbinterval combining[] = {
    {0xFB1E, 0xFB1E},
    {0xFE00, 0xFE0F},
    {0xFE20, 0xFE2F},
+   {0x101FD, 0x101FD},
+   {0x102E0, 0x102E0},
+   {0x10376, 0x1037A},
+   {0x10A01, 0x10A0F},
+   {0x10A38, 0x10A3F},
+   {0x10AE5, 0x10AE6},
+   {0x10D24, 0x10D27},
+   {0x10EAB, 0x10EAC},
+   {0x10F46, 0x10F50},
+   {0x11001, 0x11001},
+   {0x11038, 0x11046},
+   {0x1107F, 0x11081},
+   {0x110B3, 0x110B6},
+   {0x110B9, 0x110BA},
+   {0x11100, 0x11102},
+   {0x11127, 0x1112B},
+   {0x1112D, 0x11134},
+   {0x11173, 0x11173},
+   {0x11180, 0x11181},
+   {0x111B6, 0x111BE},
+   {0x111C9, 0x111CC},
+   {0x111CF, 0x111CF},
+   {0x1122F, 0x11231},
+   {0x11234, 0x11234},
+   {0x11236, 0x11237},
+   {0x1123E, 0x1123E},
+   {0x112DF, 0x112DF},
+   {0x112E3, 0x112EA},
+   {0x11300, 0x11301},
+   {0x1133B, 0x1133C},
+   {0x11340, 0x11340},
+   {0x11366, 0x11374},
+   {0x11438, 0x1143F},
+   {0x11442, 0x11444},
+   {0x11446, 0x11446},
+   {0x1145E, 0x1145E},
+   {0x114B3, 0x114B8},
+   {0x114BA, 0x114BA},
+   {0x114BF, 0x114C0},
+   {0x114C2, 0x114C3},
+   {0x115B2, 0x115B5},
+   {0x115BC, 0x115BD},
+   {0x115BF, 0x115C0},
+   {0x115DC, 0x115DD},
+   {0x11633, 0x1163A},
+   {0x1163D, 0x1163D},
+   {0x1163F, 0x11640},
+   {0x116AB, 0x116AB},
+   {0x116AD, 0x116AD},
+   {0x116B0, 0x116B5},
+   {0x116B7, 0x116B7},
+   {0x1171D, 0x1171F},
+   {0x11722, 0x11725},
+   {0x11727, 0x1172B},
+   {0x1182F, 0x11837},
+   {0x11839, 0x1183A},
+   {0x1193B, 0x1193C},
+   {0x1193E, 0x1193E},
+   {0x11943, 0x11943},
+   {0x119D4, 0x119DB},
+   {0x119E0, 0x119E0},
+   {0x11A01, 0x11A0A},
+   {0x11A33, 0x11A38},
+   {0x11A3B, 0x11A3E},
+   {0x11A47, 0x11A47},
+   {0x11A51, 0x11A56},
+   {0x11A59, 0x11A5B},
+   {0x11A8A, 0x11A96},
+   {0x11A98, 0x11A99},
+   {0x11C30, 0x11C3D},
+   {0x11C3F, 0x11C3F},
+   {0x11C92, 0x11CA7},
+   {0x11CAA, 0x11CB0},
+   {0x11CB2, 0x11CB3},
+   {0x11CB5, 0x11CB6},
+   {0x11D31, 0x11D45},
+   {0x11D47, 0x11D47},
+   {0x11D90, 0x11D91},
+   {0x11D95, 0x11D95},
+   {0x11D97, 0x11D97},
+   {0x11EF3, 0x11EF4},
+   {0x16AF0, 0x16AF4},
+   {0x16B30, 0x16B36},
+   {0x16F4F, 0x16F4F},
+   {0x16F8F, 0x16F92},
+   {0x16FE4, 0x16FE4},
+   {0x1BC9D, 0x1BC9E},
+   {0x1D167, 0x1D169},
+   {0x1D17B, 0x1D182},
+   {0x1D185, 0x1D18B},
+   {0x1D1AA, 0x1D1AD},
+   {0x1D242, 0x1D244},
+   {0x1DA00, 0x1DA36},
+   {0x1DA3B, 0x1DA6C},
+   {0x1DA75, 0x1DA75},
+   {0x1DA84, 0x1DA84},
+   {0x1DA9B, 0x1E02A},
+   {0x1E130, 0x1E136},
+   {0x1E2EC, 0x1E2EF},
+   {0x1E8D0, 0x1E8D6},
+   {0x1E944, 0x1E94A},
+   {0xE0100, 0xE01EF},
 };