5 files changed, 50 insertions, 35 deletions
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 7739b81807e..2dedf6b0fb6 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output)
 	 * 2) Normalize -- Normalize the result of step 1 using Unicode
 	 * normalization.
 	 */
-	output_chars = unicode_normalize_kc(input_chars);
+	output_chars = unicode_normalize(UNICODE_NFKC, input_chars);
 	if (!output_chars)
 		goto oom;
 
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 5c3dd9fd31a..6417b3f0cd1 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -48,7 +48,7 @@ typedef struct
 {
 	int			linenum;
 	pg_wchar	input[50];
-	pg_wchar	output[50];
+	pg_wchar	output[4][50];
 } pg_unicode_test;
 
 /* test table */
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
 	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
 
 	my $source_utf8 = codepoint_string_to_hex($source);
+	my $nfc_utf8    = codepoint_string_to_hex($nfc);
+	my $nfd_utf8    = codepoint_string_to_hex($nfd);
 	my $nfkc_utf8   = codepoint_string_to_hex($nfkc);
+	my $nfkd_utf8   = codepoint_string_to_hex($nfkd);
 
-	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
+	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
 }
 
 # Output terminator entry
-print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
 print $OUTPUT "\n};\n";
 
 close $OUTPUT;
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
index ad995646337..cd5f502d540 100644
--- a/src/common/unicode/generate-unicode_norm_table.pl
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -99,10 +99,12 @@ typedef struct
 #define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
 #define DECOMP_INLINE		0x40	/* decomposition is stored inline in
 									 * dec_index */
+#define DECOMP_COMPAT		0x20	/* compatibility mapping */
 
-#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
-#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
 #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
 
 /* Table of Unicode codepoints and their decompositions */
 static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
@@ -136,22 +138,22 @@ foreach my $char (@characters)
 	# Decomposition size
 	# Print size of decomposition
 	my $decomp_size = scalar(@decomp_elts);
+	die "decomposition size $decomp_size does not fit in the 5 size bits of dec_size_flags" if $decomp_size > 0x1F;
 
 	my $first_decomp = shift @decomp_elts;
 
 	my $flags   = "";
 	my $comment = "";
 
-	if ($decomp_size == 2)
+	if ($compat)
 	{
+		$flags .= " | DECOMP_COMPAT";
+	}
 
+	if ($decomp_size == 2)
+	{
 		# Should this be used for recomposition?
-		if ($compat)
-		{
-			$flags .= " | DECOMP_NO_COMPOSE";
-			$comment = "compatibility mapping";
-		}
-		elsif ($character_hash{$first_decomp}
+		if ($character_hash{$first_decomp}
 			&& $character_hash{$first_decomp}->{class} != 0)
 		{
 			$flags .= " | DECOMP_NO_COMPOSE";
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index f6e8f0c0bb7..dde5d24349f 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -63,18 +63,21 @@ main(int argc, char **argv)
 
 	for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
 	{
-		pg_wchar   *result;
+		for (int form = 0; form < 4; form++)
+		{
+			pg_wchar   *result;
 
-		result = unicode_normalize_kc(test->input);
+			result = unicode_normalize(form, test->input);
 
-		if (pg_wcscmp(test->output, result) != 0)
-		{
-			printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
-			printf("input:    %s\n", print_wchar_str(test->input));
-			printf("expected: %s\n", print_wchar_str(test->output));
-			printf("got:      %s\n", print_wchar_str(result));
-			printf("\n");
-			exit(1);
+			if (pg_wcscmp(test->output[form], result) != 0)
+			{
+				printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
+				printf("input:    %s\n", print_wchar_str(test->input));
+				printf("expected: %s\n", print_wchar_str(test->output[form]));
+				printf("got:      %s\n", print_wchar_str(result));
+				printf("\n");
+				exit(1);
+			}
 		}
 	}
 
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ab56490a075..ec5abea6bdd 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  * unicode_norm.c
- *		Normalize a Unicode string to NFKC form
+ *		Normalize a Unicode string
  *
  * This implements Unicode normalization, per the documentation at
  * https://www.unicode.org/reports/tr15/.
@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
  * are, in turn, decomposable.
  */
 static int
-get_decomposed_size(pg_wchar code)
+get_decomposed_size(pg_wchar code, bool compat)
 {
 	pg_unicode_decomposition *entry;
 	int			size = 0;
@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
 	 * Just count current code if no other decompositions.  A NULL entry is
 	 * equivalent to a character with class 0 and no decompositions.
 	 */
-	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 		return 1;
 
 	/*
@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
 	{
 		uint32		lcode = decomp[i];
 
-		size += get_decomposed_size(lcode);
+		size += get_decomposed_size(lcode, compat);
 	}
 
 	return size;
@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
  * in the array result.
  */
 static void
-decompose_code(pg_wchar code, pg_wchar **result, int *current)
+decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 {
 	pg_unicode_decomposition *entry;
 	int			i;
@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	 * character with class 0 and no decompositions, so just leave also in
 	 * this case.
 	 */
-	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 	{
 		pg_wchar   *res = *result;
 
@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 		pg_wchar	lcode = (pg_wchar) decomp[i];
 
 		/* Leave if no more decompositions */
-		decompose_code(lcode, result, current);
+		decompose_code(lcode, compat, result, current);
 	}
 }
 
 /*
- * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
+ * unicode_normalize - Normalize a Unicode string to the specified form.
  *
  * The input is a 0-terminated array of codepoints.
  *
@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
  * string is palloc'd instead, and OOM is reported with ereport().
  */
 pg_wchar *
-unicode_normalize_kc(const pg_wchar *input)
+unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 {
+	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
+	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
 	pg_wchar   *decomp_chars;
 	pg_wchar   *recomp_chars;
 	int			decomp_size,
@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
 	 */
 	decomp_size = 0;
 	for (p = input; *p; p++)
-		decomp_size += get_decomposed_size(*p);
+		decomp_size += get_decomposed_size(*p, compat);
 
 	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 	if (decomp_chars == NULL)
@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
 	 */
 	current_size = 0;
 	for (p = input; *p; p++)
-		decompose_code(*p, &decomp_chars, &current_size);
+		decompose_code(*p, compat, &decomp_chars, &current_size);
 	decomp_chars[decomp_size] = '\0';
 	Assert(decomp_size == current_size);
 
@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
 			count -= 2;
 	}
 
+	if (!recompose)
+		return decomp_chars;
+
 	/*
-	 * The last phase of NFKC is the recomposition of the reordered Unicode
+	 * The last phase of NFC/NFKC is the recomposition of the reordered Unicode
 	 * string using combining classes. The recomposed string cannot be longer
 	 * than the decomposed one, so make the allocation of the output string
 	 * based on that assumption.