diff options
Diffstat (limited to 'src/common')
-rw-r--r-- | src/common/saslprep.c | 2 | ||||
-rw-r--r-- | src/common/unicode/generate-norm_test_table.pl | 9 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_norm_table.pl | 20 | ||||
-rw-r--r-- | src/common/unicode/norm_test.c | 23 | ||||
-rw-r--r-- | src/common/unicode_norm.c | 31 |
5 files changed, 50 insertions, 35 deletions
diff --git a/src/common/saslprep.c b/src/common/saslprep.c index 7739b81807e..2dedf6b0fb6 100644 --- a/src/common/saslprep.c +++ b/src/common/saslprep.c @@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output) * 2) Normalize -- Normalize the result of step 1 using Unicode * normalization. */ - output_chars = unicode_normalize_kc(input_chars); + output_chars = unicode_normalize(UNICODE_NFKC, input_chars); if (!output_chars) goto oom; diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl index 5c3dd9fd31a..6417b3f0cd1 100644 --- a/src/common/unicode/generate-norm_test_table.pl +++ b/src/common/unicode/generate-norm_test_table.pl @@ -48,7 +48,7 @@ typedef struct { int linenum; pg_wchar input[50]; - pg_wchar output[50]; + pg_wchar output[4][50]; } pg_unicode_test; /* test table */ @@ -89,13 +89,16 @@ while (my $line = <$INPUT>) my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line); my $source_utf8 = codepoint_string_to_hex($source); + my $nfc_utf8 = codepoint_string_to_hex($nfc); + my $nfd_utf8 = codepoint_string_to_hex($nfd); my $nfkc_utf8 = codepoint_string_to_hex($nfkc); + my $nfkd_utf8 = codepoint_string_to_hex($nfkd); - print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n"; + print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n"; } # Output terminator entry -print $OUTPUT "\t{ 0, { 0 }, { 0 } }"; +print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }"; print $OUTPUT "\n};\n"; close $OUTPUT; diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl index ad995646337..cd5f502d540 100644 --- a/src/common/unicode/generate-unicode_norm_table.pl +++ b/src/common/unicode/generate-unicode_norm_table.pl @@ -99,10 +99,12 @@ typedef struct #define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */ #define DECOMP_INLINE 0x40 /* decomposition is stored inline 
in * dec_index */ +#define DECOMP_COMPAT 0x20 /* compatibility mapping */ -#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F) -#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0) +#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) +#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) +#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) /* Table of Unicode codepoints and their decompositions */ static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = @@ -136,22 +138,22 @@ foreach my $char (@characters) # Decomposition size # Print size of decomposition my $decomp_size = scalar(@decomp_elts); + die if $decomp_size > 0x1F; # to not overrun bitmask my $first_decomp = shift @decomp_elts; my $flags = ""; my $comment = ""; - if ($decomp_size == 2) + if ($compat) { + $flags .= " | DECOMP_COMPAT"; + } + if ($decomp_size == 2) + { # Should this be used for recomposition? 
- if ($compat) - { - $flags .= " | DECOMP_NO_COMPOSE"; - $comment = "compatibility mapping"; - } - elsif ($character_hash{$first_decomp} + if ($character_hash{$first_decomp} && $character_hash{$first_decomp}->{class} != 0) { $flags .= " | DECOMP_NO_COMPOSE"; diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c index f6e8f0c0bb7..dde5d24349f 100644 --- a/src/common/unicode/norm_test.c +++ b/src/common/unicode/norm_test.c @@ -63,18 +63,21 @@ main(int argc, char **argv) for (test = UnicodeNormalizationTests; test->input[0] != 0; test++) { - pg_wchar *result; + for (int form = 0; form < 4; form++) + { + pg_wchar *result; - result = unicode_normalize_kc(test->input); + result = unicode_normalize(form, test->input); - if (pg_wcscmp(test->output, result) != 0) - { - printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum); - printf("input: %s\n", print_wchar_str(test->input)); - printf("expected: %s\n", print_wchar_str(test->output)); - printf("got: %s\n", print_wchar_str(result)); - printf("\n"); - exit(1); + if (pg_wcscmp(test->output[form], result) != 0) + { + printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form); + printf("input: %s\n", print_wchar_str(test->input)); + printf("expected: %s\n", print_wchar_str(test->output[form])); + printf("got: %s\n", print_wchar_str(result)); + printf("\n"); + exit(1); + } } } diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index ab56490a075..ec5abea6bdd 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * unicode_norm.c - * Normalize a Unicode string to NFKC form + * Normalize a Unicode string * * This implements Unicode normalization, per the documentation at * https://www.unicode.org/reports/tr15/. @@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) * are, in turn, decomposable. 
*/ static int -get_decomposed_size(pg_wchar code) +get_decomposed_size(pg_wchar code, bool compat) { pg_unicode_decomposition *entry; int size = 0; @@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code) * Just count current code if no other decompositions. A NULL entry is * equivalent to a character with class 0 and no decompositions. */ - if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) return 1; /* @@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code) { uint32 lcode = decomp[i]; - size += get_decomposed_size(lcode); + size += get_decomposed_size(lcode, compat); } return size; @@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) * in the array result. */ static void -decompose_code(pg_wchar code, pg_wchar **result, int *current) +decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) { pg_unicode_decomposition *entry; int i; @@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) * character with class 0 and no decompositions, so just leave also in * this case. */ - if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) { pg_wchar *res = *result; @@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) pg_wchar lcode = (pg_wchar) decomp[i]; /* Leave if no more decompositions */ - decompose_code(lcode, result, current); + decompose_code(lcode, compat, result, current); } } /* - * unicode_normalize_kc - Normalize a Unicode string to NFKC form. + * unicode_normalize - Normalize a Unicode string to the specified form. * * The input is a 0-terminated array of codepoints. * @@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) * string is palloc'd instead, and OOM is reported with ereport(). 
*/ pg_wchar * -unicode_normalize_kc(const pg_wchar *input) +unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) { + bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); + bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); pg_wchar *decomp_chars; pg_wchar *recomp_chars; int decomp_size, @@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input) */ decomp_size = 0; for (p = input; *p; p++) - decomp_size += get_decomposed_size(*p); + decomp_size += get_decomposed_size(*p, compat); decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); if (decomp_chars == NULL) @@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input) */ current_size = 0; for (p = input; *p; p++) - decompose_code(*p, &decomp_chars, &current_size); + decompose_code(*p, compat, &decomp_chars, &current_size); decomp_chars[decomp_size] = '\0'; Assert(decomp_size == current_size); @@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input) count -= 2; } + if (!recompose) + return decomp_chars; + /* - * The last phase of NFKC is the recomposition of the reordered Unicode + * The last phase of NFC and NFKC is the recomposition of the reordered Unicode * string using combining classes. The recomposed string cannot be longer * than the decomposed one, so make the allocation of the output string * based on that assumption. |