summary | refs | log | tree | commit | diff
path: root/src/common
diff options
context:
space:
mode:
Diffstat (limited to 'src/common')
-rw-r--r--  src/common/saslprep.c                              2
-rw-r--r--  src/common/unicode/generate-norm_test_table.pl     9
-rw-r--r--  src/common/unicode/generate-unicode_norm_table.pl  20
-rw-r--r--  src/common/unicode/norm_test.c                     23
-rw-r--r--  src/common/unicode_norm.c                          31
5 files changed, 50 insertions, 35 deletions
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 7739b81807e..2dedf6b0fb6 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output)
* 2) Normalize -- Normalize the result of step 1 using Unicode
* normalization.
*/
- output_chars = unicode_normalize_kc(input_chars);
+ output_chars = unicode_normalize(UNICODE_NFKC, input_chars);
if (!output_chars)
goto oom;
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 5c3dd9fd31a..6417b3f0cd1 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -48,7 +48,7 @@ typedef struct
{
int linenum;
pg_wchar input[50];
- pg_wchar output[50];
+ pg_wchar output[4][50];
} pg_unicode_test;
/* test table */
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
my $source_utf8 = codepoint_string_to_hex($source);
+ my $nfc_utf8 = codepoint_string_to_hex($nfc);
+ my $nfd_utf8 = codepoint_string_to_hex($nfd);
my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
+ my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
- print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
+ print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
}
# Output terminator entry
-print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
print $OUTPUT "\n};\n";
close $OUTPUT;
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
index ad995646337..cd5f502d540 100644
--- a/src/common/unicode/generate-unicode_norm_table.pl
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -99,10 +99,12 @@ typedef struct
#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
* dec_index */
+#define DECOMP_COMPAT 0x20 /* compatibility mapping */
-#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
-#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
@@ -136,22 +138,22 @@ foreach my $char (@characters)
# Decomposition size
# Print size of decomposition
my $decomp_size = scalar(@decomp_elts);
+ die if $decomp_size > 0x1F; # to not overrun bitmask
my $first_decomp = shift @decomp_elts;
my $flags = "";
my $comment = "";
- if ($decomp_size == 2)
+ if ($compat)
{
+ $flags .= " | DECOMP_COMPAT";
+ }
+ if ($decomp_size == 2)
+ {
# Should this be used for recomposition?
- if ($compat)
- {
- $flags .= " | DECOMP_NO_COMPOSE";
- $comment = "compatibility mapping";
- }
- elsif ($character_hash{$first_decomp}
+ if ($character_hash{$first_decomp}
&& $character_hash{$first_decomp}->{class} != 0)
{
$flags .= " | DECOMP_NO_COMPOSE";
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index f6e8f0c0bb7..dde5d24349f 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -63,18 +63,21 @@ main(int argc, char **argv)
for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
{
- pg_wchar *result;
+ for (int form = 0; form < 4; form++)
+ {
+ pg_wchar *result;
- result = unicode_normalize_kc(test->input);
+ result = unicode_normalize(form, test->input);
- if (pg_wcscmp(test->output, result) != 0)
- {
- printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
- printf("input: %s\n", print_wchar_str(test->input));
- printf("expected: %s\n", print_wchar_str(test->output));
- printf("got: %s\n", print_wchar_str(result));
- printf("\n");
- exit(1);
+ if (pg_wcscmp(test->output[form], result) != 0)
+ {
+ printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
+ printf("input: %s\n", print_wchar_str(test->input));
+ printf("expected: %s\n", print_wchar_str(test->output[form]));
+ printf("got: %s\n", print_wchar_str(result));
+ printf("\n");
+ exit(1);
+ }
}
}
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ab56490a075..ec5abea6bdd 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -1,6 +1,6 @@
/*-------------------------------------------------------------------------
* unicode_norm.c
- * Normalize a Unicode string to NFKC form
+ * Normalize a Unicode string
*
* This implements Unicode normalization, per the documentation at
* https://www.unicode.org/reports/tr15/.
@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code)
+get_decomposed_size(pg_wchar code, bool compat)
{
pg_unicode_decomposition *entry;
int size = 0;
@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
* Just count current code if no other decompositions. A NULL entry is
* equivalent to a character with class 0 and no decompositions.
*/
- if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+ if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+ (!compat && DECOMPOSITION_IS_COMPAT(entry)))
return 1;
/*
@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
{
uint32 lcode = decomp[i];
- size += get_decomposed_size(lcode);
+ size += get_decomposed_size(lcode, compat);
}
return size;
@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, pg_wchar **result, int *current)
+decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
pg_unicode_decomposition *entry;
int i;
@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
* character with class 0 and no decompositions, so just leave also in
* this case.
*/
- if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+ if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+ (!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
pg_wchar *res = *result;
@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
pg_wchar lcode = (pg_wchar) decomp[i];
/* Leave if no more decompositions */
- decompose_code(lcode, result, current);
+ decompose_code(lcode, compat, result, current);
}
}
/*
- * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
+ * unicode_normalize - Normalize a Unicode string to the specified form.
*
* The input is a 0-terminated array of codepoints.
*
@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
* string is palloc'd instead, and OOM is reported with ereport().
*/
pg_wchar *
-unicode_normalize_kc(const pg_wchar *input)
+unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
{
+ bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
+ bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
pg_wchar *decomp_chars;
pg_wchar *recomp_chars;
int decomp_size,
@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
*/
decomp_size = 0;
for (p = input; *p; p++)
- decomp_size += get_decomposed_size(*p);
+ decomp_size += get_decomposed_size(*p, compat);
decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
if (decomp_chars == NULL)
@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
*/
current_size = 0;
for (p = input; *p; p++)
- decompose_code(*p, &decomp_chars, &current_size);
+ decompose_code(*p, compat, &decomp_chars, &current_size);
decomp_chars[decomp_size] = '\0';
Assert(decomp_size == current_size);
@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
count -= 2;
}
+ if (!recompose)
+ return decomp_chars;
+
/*
- * The last phase of NFKC is the recomposition of the reordered Unicode
+ * The last phase of NFC and NFKC is the recomposition of the reordered Unicode
* string using combining classes. The recomposed string cannot be longer
* than the decomposed one, so make the allocation of the output string
* based on that assumption.