Prevent mis-encoding of "trailing junk after numeric literal" errors.

Since commit 2549f0661, we reject an identifier immediately following a numeric literal (without separating whitespace), because that risks ambiguity with hex/octal/binary integers. However, that patch used token patterns like "{integer}{ident_start}", which is problematic because {ident_start} matches only a single byte. If the first character after the integer is a multibyte character, this ends up with flex reporting an error message that includes a partial multibyte character. That can cause assorted bad-encoding problems downstream, both in the report to the client and in the postmaster log file.

To fix, use {identifier} not {ident_start} in the "junk" token patterns, so that they will match complete multibyte characters. This seems like a generally better user experience quite aside from the encoding problem: for "123abc" the error message will now say that the error appeared at or near "123abc" instead of "123a".

While at it, add some commentary about why these patterns exist and how they work, and collapse the separate decimal/hex/octal/binary integer "junk" patterns into a single integer_junk pattern, since {decinteger}{identifier} already matches everything the per-radix patterns would.

Report and patch by Karina Litskevich; review by Pavel Borisov. Back-patch to v15 where the problem came in.

Discussion: https://postgr.es/m/CACiT8iZ_diop=0zJ7zuY3BXegJpkKK1Av-PU7xh0EDYHsa5+=g@mail.gmail.com
author: Tom Lane 2024-09-05 16:42:33 +0000
committer: Tom Lane 2024-09-05 16:42:33 +0000
commit: fadff3fc94598db1d87e4242821964fb2850e19e (patch)
tree: 713189c47f359a2a6ac08b22d17d63f9ded8a37f /src/backend/parser
parent: 85837b8037ada19d319fa4d3ba99c72205868199 (diff)
1 files changed, 23 insertions, 21 deletions
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f74059e7b0b..994ed9995ac 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -412,16 +412,30 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
-
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+
+/*
+ * An identifier immediately following an integer literal is disallowed because
+ * in some cases it's ambiguous what is meant: for example, 0x1234 could be
+ * either a hexinteger or a decinteger "0" and an identifier "x1234".  We can
+ * detect such problems by seeing if integer_junk matches a longer substring
+ * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
+ * bininteger).  One "junk" pattern is sufficient because
+ * {decinteger}{identifier} will match all the same strings we'd match with
+ * {hexinteger}{identifier} etc.
+ *
+ * Note that the rule for integer_junk must appear after the ones for
+ * XXXinteger to make this work correctly: 0x1234 will match both hexinteger
+ * and integer_junk, and we need hexinteger to be chosen in that case.
+ *
+ * Also disallow strings matched by numeric_junk, real_junk and param_junk
+ * for consistency.
+ */
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
+param_junk		\${decdigit}+{identifier}
 
 other			.
 
@@ -1055,19 +1069,7 @@ other			.
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
-{decinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{hexinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{octinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{bininteger_junk}	{
+{integer_junk}	{
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
author	Tom Lane	2024-09-05 16:42:33 +0000
committer	Tom Lane	2024-09-05 16:42:33 +0000
commit	fadff3fc94598db1d87e4242821964fb2850e19e (patch)
tree	713189c47f359a2a6ac08b22d17d63f9ded8a37f /src/backend/parser
parent	85837b8037ada19d319fa4d3ba99c72205868199 (diff)