diff options
Diffstat (limited to 'src/parser/wchar.c')
| -rw-r--r-- | src/parser/wchar.c | 554 |
1 files changed, 99 insertions, 455 deletions
diff --git a/src/parser/wchar.c b/src/parser/wchar.c index 1e3a1b0ae..27c972fc0 100644 --- a/src/parser/wchar.c +++ b/src/parser/wchar.c @@ -9,6 +9,7 @@ #include "utils/elog.h" #include <stdio.h> #include <string.h> +#include <limits.h> #include "pg_wchar.h" #ifndef WIN32 @@ -115,6 +116,24 @@ const char *pg_enc2gettext_tbl[] = [PG_SHIFT_JIS_2004] = "SHIFT_JISX0213", }; +/* + * In today's multibyte encodings other than UTF8, this two-byte sequence + * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0. + * + * For historical reasons, several verifychar implementations opt to reject + * this pair specifically. Byte pair range constraints, in encoding + * originator documentation, always excluded this pair. No core conversion + * could translate it. However, longstanding verifychar implementations + * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate + * pairs not valid per encoding originator documentation. To avoid tightening + * core or non-core conversions in a security patch, we sought this one pair. + * + * PQescapeString() historically used spaces for BYTE1; many other values + * could suffice for BYTE1. + */ +#define NONUTF8_INVALID_BYTE0 (0x8d) +#define NONUTF8_INVALID_BYTE1 (' ') + /* * Operations on multi-byte encodings are driven by a table of helper @@ -706,454 +725,8 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max) static int ucs_wcwidth(pg_wchar ucs) { - /* sorted list of non-overlapping intervals of non-spacing characters */ - static const struct mbinterval nonspacing[] = { - {0x00AD, 0x00AD}, - {0x0300, 0x036F}, - {0x0483, 0x0489}, - {0x0591, 0x05BD}, - {0x05BF, 0x05BF}, - {0x05C1, 0x05C2}, - {0x05C4, 0x05C5}, - {0x05C7, 0x05C7}, - {0x0600, 0x0605}, - {0x0610, 0x061A}, - {0x061C, 0x061C}, - {0x064B, 0x065F}, - {0x0670, 0x0670}, - {0x06D6, 0x06DD}, - {0x06DF, 0x06E4}, - {0x06E7, 0x06E8}, - {0x06EA, 0x06ED}, - {0x070F, 0x070F}, - {0x0711, 0x0711}, - {0x0730, 0x074A}, - {0x07A6, 0x07B0}, - {0x07EB, 0x07F3}, - {0x07FD, 0x07FD}, - {0x0816, 0x0819}, - {0x081B, 0x0823}, - {0x0825, 0x0827}, - {0x0829, 0x082D}, - {0x0859, 0x085B}, - {0x0890, 0x089F}, - {0x08CA, 0x0902}, - {0x093A, 0x093A}, - {0x093C, 0x093C}, - {0x0941, 0x0948}, - {0x094D, 0x094D}, - {0x0951, 0x0957}, - {0x0962, 0x0963}, - {0x0981, 0x0981}, - {0x09BC, 0x09BC}, - {0x09C1, 0x09C4}, - {0x09CD, 0x09CD}, - {0x09E2, 0x09E3}, - {0x09FE, 0x0A02}, - {0x0A3C, 0x0A3C}, - {0x0A41, 0x0A51}, - {0x0A70, 0x0A71}, - {0x0A75, 0x0A75}, - {0x0A81, 0x0A82}, - {0x0ABC, 0x0ABC}, - {0x0AC1, 0x0AC8}, - {0x0ACD, 0x0ACD}, - {0x0AE2, 0x0AE3}, - {0x0AFA, 0x0B01}, - {0x0B3C, 0x0B3C}, - {0x0B3F, 0x0B3F}, - {0x0B41, 0x0B44}, - {0x0B4D, 0x0B56}, - {0x0B62, 0x0B63}, - {0x0B82, 0x0B82}, - {0x0BC0, 0x0BC0}, - {0x0BCD, 0x0BCD}, - {0x0C00, 0x0C00}, - {0x0C04, 0x0C04}, - {0x0C3C, 0x0C3C}, - {0x0C3E, 0x0C40}, - {0x0C46, 0x0C56}, - {0x0C62, 0x0C63}, - {0x0C81, 0x0C81}, - {0x0CBC, 0x0CBC}, - {0x0CBF, 0x0CBF}, - {0x0CC6, 0x0CC6}, - {0x0CCC, 0x0CCD}, - {0x0CE2, 0x0CE3}, - {0x0D00, 0x0D01}, - {0x0D3B, 0x0D3C}, - {0x0D41, 0x0D44}, - {0x0D4D, 0x0D4D}, - {0x0D62, 0x0D63}, - {0x0D81, 0x0D81}, - {0x0DCA, 0x0DCA}, - {0x0DD2, 0x0DD6}, - {0x0E31, 0x0E31}, - {0x0E34, 0x0E3A}, - {0x0E47, 0x0E4E}, - {0x0EB1, 0x0EB1}, - {0x0EB4, 0x0EBC}, - {0x0EC8, 0x0ECE}, - {0x0F18, 0x0F19}, - {0x0F35, 0x0F35}, - {0x0F37, 0x0F37}, - {0x0F39, 0x0F39}, - {0x0F71, 0x0F7E}, - {0x0F80, 0x0F84}, - {0x0F86, 0x0F87}, - {0x0F8D, 0x0FBC}, - {0x0FC6, 0x0FC6}, - {0x102D, 0x1030}, - {0x1032, 0x1037}, - {0x1039, 0x103A}, - {0x103D, 0x103E}, - {0x1058, 0x1059}, - {0x105E, 0x1060}, - {0x1071, 0x1074}, - {0x1082, 0x1082}, - {0x1085, 0x1086}, - {0x108D, 0x108D}, - {0x109D, 0x109D}, - {0x135D, 0x135F}, - {0x1712, 0x1714}, - {0x1732, 0x1733}, - {0x1752, 0x1753}, - {0x1772, 0x1773}, - {0x17B4, 0x17B5}, - {0x17B7, 0x17BD}, - {0x17C6, 0x17C6}, - {0x17C9, 0x17D3}, - {0x17DD, 0x17DD}, - {0x180B, 0x180F}, - {0x1885, 0x1886}, - {0x18A9, 0x18A9}, - {0x1920, 0x1922}, - {0x1927, 0x1928}, - {0x1932, 0x1932}, - {0x1939, 0x193B}, - {0x1A17, 0x1A18}, - {0x1A1B, 0x1A1B}, - {0x1A56, 0x1A56}, - {0x1A58, 0x1A60}, - {0x1A62, 0x1A62}, - {0x1A65, 0x1A6C}, - {0x1A73, 0x1A7F}, - {0x1AB0, 0x1B03}, - {0x1B34, 0x1B34}, - {0x1B36, 0x1B3A}, - {0x1B3C, 0x1B3C}, - {0x1B42, 0x1B42}, - {0x1B6B, 0x1B73}, - {0x1B80, 0x1B81}, - {0x1BA2, 0x1BA5}, - {0x1BA8, 0x1BA9}, - {0x1BAB, 0x1BAD}, - {0x1BE6, 0x1BE6}, - {0x1BE8, 0x1BE9}, - {0x1BED, 0x1BED}, - {0x1BEF, 0x1BF1}, - {0x1C2C, 0x1C33}, - {0x1C36, 0x1C37}, - {0x1CD0, 0x1CD2}, - {0x1CD4, 0x1CE0}, - {0x1CE2, 0x1CE8}, - {0x1CED, 0x1CED}, - {0x1CF4, 0x1CF4}, - {0x1CF8, 0x1CF9}, - {0x1DC0, 0x1DFF}, - {0x200B, 0x200F}, - {0x202A, 0x202E}, - {0x2060, 0x206F}, - {0x20D0, 0x20F0}, - {0x2CEF, 0x2CF1}, - {0x2D7F, 0x2D7F}, - {0x2DE0, 0x2DFF}, - {0x302A, 0x302D}, - {0x3099, 0x309A}, - {0xA66F, 0xA672}, - {0xA674, 0xA67D}, - {0xA69E, 0xA69F}, - {0xA6F0, 0xA6F1}, - {0xA802, 0xA802}, - {0xA806, 0xA806}, - {0xA80B, 0xA80B}, - {0xA825, 0xA826}, - {0xA82C, 0xA82C}, - {0xA8C4, 0xA8C5}, - {0xA8E0, 0xA8F1}, - {0xA8FF, 0xA8FF}, - {0xA926, 0xA92D}, - {0xA947, 0xA951}, - {0xA980, 0xA982}, - {0xA9B3, 0xA9B3}, - {0xA9B6, 0xA9B9}, - {0xA9BC, 0xA9BD}, - {0xA9E5, 0xA9E5}, - {0xAA29, 0xAA2E}, - {0xAA31, 0xAA32}, - {0xAA35, 0xAA36}, - {0xAA43, 0xAA43}, - {0xAA4C, 0xAA4C}, - {0xAA7C, 0xAA7C}, - {0xAAB0, 0xAAB0}, - {0xAAB2, 0xAAB4}, - {0xAAB7, 0xAAB8}, - {0xAABE, 0xAABF}, - {0xAAC1, 0xAAC1}, - {0xAAEC, 0xAAED}, - {0xAAF6, 0xAAF6}, - {0xABE5, 0xABE5}, - {0xABE8, 0xABE8}, - {0xABED, 0xABED}, - {0xFB1E, 0xFB1E}, - {0xFE00, 0xFE0F}, - {0xFE20, 0xFE2F}, - {0xFEFF, 0xFEFF}, - {0xFFF9, 0xFFFB}, - {0x101FD, 0x101FD}, - {0x102E0, 0x102E0}, - {0x10376, 0x1037A}, - {0x10A01, 0x10A0F}, - {0x10A38, 0x10A3F}, - {0x10AE5, 0x10AE6}, - {0x10D24, 0x10D27}, - {0x10EAB, 0x10EAC}, - {0x10EFD, 0x10EFF}, - {0x10F46, 0x10F50}, - {0x10F82, 0x10F85}, - {0x11001, 0x11001}, - {0x11038, 0x11046}, - {0x11070, 0x11070}, - {0x11073, 0x11074}, - {0x1107F, 0x11081}, - {0x110B3, 0x110B6}, - {0x110B9, 0x110BA}, - {0x110BD, 0x110BD}, - {0x110C2, 0x110CD}, - {0x11100, 0x11102}, - {0x11127, 0x1112B}, - {0x1112D, 0x11134}, - {0x11173, 0x11173}, - {0x11180, 0x11181}, - {0x111B6, 0x111BE}, - {0x111C9, 0x111CC}, - {0x111CF, 0x111CF}, - {0x1122F, 0x11231}, - {0x11234, 0x11234}, - {0x11236, 0x11237}, - {0x1123E, 0x1123E}, - {0x11241, 0x11241}, - {0x112DF, 0x112DF}, - {0x112E3, 0x112EA}, - {0x11300, 0x11301}, - {0x1133B, 0x1133C}, - {0x11340, 0x11340}, - {0x11366, 0x11374}, - {0x11438, 0x1143F}, - {0x11442, 0x11444}, - {0x11446, 0x11446}, - {0x1145E, 0x1145E}, - {0x114B3, 0x114B8}, - {0x114BA, 0x114BA}, - {0x114BF, 0x114C0}, - {0x114C2, 0x114C3}, - {0x115B2, 0x115B5}, - {0x115BC, 0x115BD}, - {0x115BF, 0x115C0}, - {0x115DC, 0x115DD}, - {0x11633, 0x1163A}, - {0x1163D, 0x1163D}, - {0x1163F, 0x11640}, - {0x116AB, 0x116AB}, - {0x116AD, 0x116AD}, - {0x116B0, 0x116B5}, - {0x116B7, 0x116B7}, - {0x1171D, 0x1171F}, - {0x11722, 0x11725}, - {0x11727, 0x1172B}, - {0x1182F, 0x11837}, - {0x11839, 0x1183A}, - {0x1193B, 0x1193C}, - {0x1193E, 0x1193E}, - {0x11943, 0x11943}, - {0x119D4, 0x119DB}, - {0x119E0, 0x119E0}, - {0x11A01, 0x11A0A}, - {0x11A33, 0x11A38}, - {0x11A3B, 0x11A3E}, - {0x11A47, 0x11A47}, - {0x11A51, 0x11A56}, - {0x11A59, 0x11A5B}, - {0x11A8A, 0x11A96}, - {0x11A98, 0x11A99}, - {0x11C30, 0x11C3D}, - {0x11C3F, 0x11C3F}, - {0x11C92, 0x11CA7}, - {0x11CAA, 0x11CB0}, - {0x11CB2, 0x11CB3}, - {0x11CB5, 0x11CB6}, - {0x11D31, 0x11D45}, - {0x11D47, 0x11D47}, - {0x11D90, 0x11D91}, - {0x11D95, 0x11D95}, - {0x11D97, 0x11D97}, - {0x11EF3, 0x11EF4}, - {0x11F00, 0x11F01}, - {0x11F36, 0x11F3A}, - {0x11F40, 0x11F40}, - {0x11F42, 0x11F42}, - {0x13430, 0x13440}, - {0x13447, 0x13455}, - {0x16AF0, 0x16AF4}, - {0x16B30, 0x16B36}, - {0x16F4F, 0x16F4F}, - {0x16F8F, 0x16F92}, - {0x16FE4, 0x16FE4}, - {0x1BC9D, 0x1BC9E}, - {0x1BCA0, 0x1CF46}, - {0x1D167, 0x1D169}, - {0x1D173, 0x1D182}, - {0x1D185, 0x1D18B}, - {0x1D1AA, 0x1D1AD}, - {0x1D242, 0x1D244}, - {0x1DA00, 0x1DA36}, - {0x1DA3B, 0x1DA6C}, - {0x1DA75, 0x1DA75}, - {0x1DA84, 0x1DA84}, - {0x1DA9B, 0x1DAAF}, - {0x1E000, 0x1E02A}, - {0x1E08F, 0x1E08F}, - {0x1E130, 0x1E136}, - {0x1E2AE, 0x1E2AE}, - {0x1E2EC, 0x1E2EF}, - {0x1E4EC, 0x1E4EF}, - {0x1E8D0, 0x1E8D6}, - {0x1E944, 0x1E94A}, - {0xE0001, 0xE01EF}, - }; - - static const struct mbinterval east_asian_fw[] = { - {0x1100, 0x115F}, - {0x231A, 0x231B}, - {0x2329, 0x232A}, - {0x23E9, 0x23EC}, - {0x23F0, 0x23F0}, - {0x23F3, 0x23F3}, - {0x25FD, 0x25FE}, - {0x2614, 0x2615}, - {0x2648, 0x2653}, - {0x267F, 0x267F}, - {0x2693, 0x2693}, - {0x26A1, 0x26A1}, - {0x26AA, 0x26AB}, - {0x26BD, 0x26BE}, - {0x26C4, 0x26C5}, - {0x26CE, 0x26CE}, - {0x26D4, 0x26D4}, - {0x26EA, 0x26EA}, - {0x26F2, 0x26F3}, - {0x26F5, 0x26F5}, - {0x26FA, 0x26FA}, - {0x26FD, 0x26FD}, - {0x2705, 0x2705}, - {0x270A, 0x270B}, - {0x2728, 0x2728}, - {0x274C, 0x274C}, - {0x274E, 0x274E}, - {0x2753, 0x2755}, - {0x2757, 0x2757}, - {0x2795, 0x2797}, - {0x27B0, 0x27B0}, - {0x27BF, 0x27BF}, - {0x2B1B, 0x2B1C}, - {0x2B50, 0x2B50}, - {0x2B55, 0x2B55}, - {0x2E80, 0x2E99}, - {0x2E9B, 0x2EF3}, - {0x2F00, 0x2FD5}, - {0x2FF0, 0x303E}, - {0x3041, 0x3096}, - {0x3099, 0x30FF}, - {0x3105, 0x312F}, - {0x3131, 0x318E}, - {0x3190, 0x31E3}, - {0x31EF, 0x321E}, - {0x3220, 0x3247}, - {0x3250, 0x4DBF}, - {0x4E00, 0xA48C}, - {0xA490, 0xA4C6}, - {0xA960, 0xA97C}, - {0xAC00, 0xD7A3}, - {0xF900, 0xFAFF}, - {0xFE10, 0xFE19}, - {0xFE30, 0xFE52}, - {0xFE54, 0xFE66}, - {0xFE68, 0xFE6B}, - {0xFF01, 0xFF60}, - {0xFFE0, 0xFFE6}, - {0x16FE0, 0x16FE4}, - {0x16FF0, 0x16FF1}, - {0x17000, 0x187F7}, - {0x18800, 0x18CD5}, - {0x18D00, 0x18D08}, - {0x1AFF0, 0x1AFF3}, - {0x1AFF5, 0x1AFFB}, - {0x1AFFD, 0x1AFFE}, - {0x1B000, 0x1B122}, - {0x1B150, 0x1B152}, - {0x1B164, 0x1B167}, - {0x1B170, 0x1B2FB}, - {0x1F004, 0x1F004}, - {0x1F0CF, 0x1F0CF}, - {0x1F18E, 0x1F18E}, - {0x1F191, 0x1F19A}, - {0x1F200, 0x1F202}, - {0x1F210, 0x1F23B}, - {0x1F240, 0x1F248}, - {0x1F250, 0x1F251}, - {0x1F260, 0x1F265}, - {0x1F300, 0x1F320}, - {0x1F32D, 0x1F335}, - {0x1F337, 0x1F37C}, - {0x1F37E, 0x1F393}, - {0x1F3A0, 0x1F3CA}, - {0x1F3CF, 0x1F3D3}, - {0x1F3E0, 0x1F3F0}, - {0x1F3F4, 0x1F3F4}, - {0x1F3F8, 0x1F43E}, - {0x1F440, 0x1F440}, - {0x1F442, 0x1F4FC}, - {0x1F4FF, 0x1F53D}, - {0x1F54B, 0x1F54E}, - {0x1F550, 0x1F567}, - {0x1F57A, 0x1F57A}, - {0x1F595, 0x1F596}, - {0x1F5A4, 0x1F5A4}, - {0x1F5FB, 0x1F64F}, - {0x1F680, 0x1F6C5}, - {0x1F6CC, 0x1F6CC}, - {0x1F6D0, 0x1F6D2}, - {0x1F6D5, 0x1F6D7}, - {0x1F6DD, 0x1F6DF}, - {0x1F6EB, 0x1F6EC}, - {0x1F6F4, 0x1F6FC}, - {0x1F7E0, 0x1F7EB}, - {0x1F7F0, 0x1F7F0}, - {0x1F90C, 0x1F93A}, - {0x1F93C, 0x1F945}, - {0x1F947, 0x1F9FF}, - {0x1FA70, 0x1FA74}, - {0x1FA78, 0x1FA7C}, - {0x1FA80, 0x1FA86}, - {0x1FA90, 0x1FAAC}, - {0x1FAB0, 0x1FABA}, - {0x1FAC0, 0x1FAC5}, - {0x1FAD0, 0x1FAD9}, - {0x1FAE0, 0x1FAE7}, - {0x1FAF0, 0x1FAF6}, - {0x20000, 0x2FFFD}, - {0x30000, 0x3FFFD}, - }; +#include "unicode_nonspacing_table.h" +#include "unicode_east_asian_fw_table.h" /* test for 8-bit control characters */ if (ucs == 0) @@ -2010,6 +1583,11 @@ pg_big5_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -2059,6 +1637,11 @@ pg_gbk_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -2108,6 +1691,11 @@ pg_uhc_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -2631,6 +2219,19 @@ pg_eucjp_increment(unsigned char *charptr, int length) /* + * Fills the provided buffer with two bytes such that: + * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 + */ +void +pg_encoding_set_invalid(int encoding, char *dst) +{ + Assert(pg_encoding_max_length(encoding) > 1); + + dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0); + dst[1] = NONUTF8_INVALID_BYTE1; +} + +/* *------------------------------------------------------------------- * encoding info table *------------------------------------------------------------------- @@ -2683,10 +2284,27 @@ const pg_wchar_tbl pg_wchar_table[] = { /* * Returns the byte length of a multibyte character. * - * Caution: when dealing with text that is not certainly valid in the - * specified encoding, the result may exceed the actual remaining - * string length. Callers that are not prepared to deal with that - * should use pg_encoding_mblen_bounded() instead. + * Choose "mblen" functions based on the input string characteristics. + * pg_encoding_mblen() can be used when ANY of these conditions are met: + * + * - The input string is zero-terminated + * + * - The input string is known to be valid in the encoding (e.g., string + * converted from database encoding) + * + * - The encoding is not GB18030 (e.g., when only database encodings are + * passed to 'encoding' parameter) + * + * encoding==GB18030 requires examining up to two bytes to determine character + * length. Therefore, callers satisfying none of those conditions must use + * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be + * guaranteed to be within allocation bounds. + * + * When dealing with text that is not certainly valid in the specified + * encoding, the result may exceed the actual remaining string length. + * Callers that are not prepared to deal with that should use Min(remaining, + * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and + * pg_encoding_mblen_bounded() are interchangeable. */ int pg_encoding_mblen(int encoding, const char *mbstr) @@ -2697,8 +2315,28 @@ pg_encoding_mblen(int encoding, const char *mbstr) } /* - * Returns the byte length of a multibyte character; but not more than - * the distance to end of string. + * Returns the byte length of a multibyte character (possibly not + * zero-terminated), or INT_MAX if too few bytes remain to determine a length. + */ +int +pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, + size_t remaining) +{ + /* + * Define zero remaining as too few, even for single-byte encodings. + * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read + * zero; others read one. + */ + if (remaining < 1 || + (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2)) + return INT_MAX; + return pg_encoding_mblen(encoding, mbstr); +} + +/* + * Returns the byte length of a multibyte character; but not more than the + * distance to the terminating zero byte. For input that might lack a + * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()). */ int pg_encoding_mblen_bounded(int encoding, const char *mbstr) @@ -2751,7 +2389,13 @@ pg_encoding_max_length(int encoding) { Assert(PG_VALID_ENCODING(encoding)); - return pg_wchar_table[encoding].maxmblen; + /* + * Check for the encoding despite the assert, due to some mingw versions + * otherwise issuing bogus warnings. + */ + return PG_VALID_ENCODING(encoding) ? + pg_wchar_table[encoding].maxmblen : + pg_wchar_table[PG_SQL_ASCII].maxmblen; } #ifndef FRONTEND |
