summaryrefslogtreecommitdiff
path: root/src/parser/wchar.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/wchar.c')
-rw-r--r--src/parser/wchar.c554
1 files changed, 99 insertions, 455 deletions
diff --git a/src/parser/wchar.c b/src/parser/wchar.c
index 1e3a1b0ae..27c972fc0 100644
--- a/src/parser/wchar.c
+++ b/src/parser/wchar.c
@@ -9,6 +9,7 @@
#include "utils/elog.h"
#include <stdio.h>
#include <string.h>
+#include <limits.h>
#include "pg_wchar.h"
#ifndef WIN32
@@ -115,6 +116,24 @@ const char *pg_enc2gettext_tbl[] =
[PG_SHIFT_JIS_2004] = "SHIFT_JISX0213",
};
+/*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically. Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair. No core conversion
+ * could translate it. However, longstanding verifychar implementations
+ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation. To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
/*
* Operations on multi-byte encodings are driven by a table of helper
@@ -706,454 +725,8 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
static int
ucs_wcwidth(pg_wchar ucs)
{
- /* sorted list of non-overlapping intervals of non-spacing characters */
- static const struct mbinterval nonspacing[] = {
- {0x00AD, 0x00AD},
- {0x0300, 0x036F},
- {0x0483, 0x0489},
- {0x0591, 0x05BD},
- {0x05BF, 0x05BF},
- {0x05C1, 0x05C2},
- {0x05C4, 0x05C5},
- {0x05C7, 0x05C7},
- {0x0600, 0x0605},
- {0x0610, 0x061A},
- {0x061C, 0x061C},
- {0x064B, 0x065F},
- {0x0670, 0x0670},
- {0x06D6, 0x06DD},
- {0x06DF, 0x06E4},
- {0x06E7, 0x06E8},
- {0x06EA, 0x06ED},
- {0x070F, 0x070F},
- {0x0711, 0x0711},
- {0x0730, 0x074A},
- {0x07A6, 0x07B0},
- {0x07EB, 0x07F3},
- {0x07FD, 0x07FD},
- {0x0816, 0x0819},
- {0x081B, 0x0823},
- {0x0825, 0x0827},
- {0x0829, 0x082D},
- {0x0859, 0x085B},
- {0x0890, 0x089F},
- {0x08CA, 0x0902},
- {0x093A, 0x093A},
- {0x093C, 0x093C},
- {0x0941, 0x0948},
- {0x094D, 0x094D},
- {0x0951, 0x0957},
- {0x0962, 0x0963},
- {0x0981, 0x0981},
- {0x09BC, 0x09BC},
- {0x09C1, 0x09C4},
- {0x09CD, 0x09CD},
- {0x09E2, 0x09E3},
- {0x09FE, 0x0A02},
- {0x0A3C, 0x0A3C},
- {0x0A41, 0x0A51},
- {0x0A70, 0x0A71},
- {0x0A75, 0x0A75},
- {0x0A81, 0x0A82},
- {0x0ABC, 0x0ABC},
- {0x0AC1, 0x0AC8},
- {0x0ACD, 0x0ACD},
- {0x0AE2, 0x0AE3},
- {0x0AFA, 0x0B01},
- {0x0B3C, 0x0B3C},
- {0x0B3F, 0x0B3F},
- {0x0B41, 0x0B44},
- {0x0B4D, 0x0B56},
- {0x0B62, 0x0B63},
- {0x0B82, 0x0B82},
- {0x0BC0, 0x0BC0},
- {0x0BCD, 0x0BCD},
- {0x0C00, 0x0C00},
- {0x0C04, 0x0C04},
- {0x0C3C, 0x0C3C},
- {0x0C3E, 0x0C40},
- {0x0C46, 0x0C56},
- {0x0C62, 0x0C63},
- {0x0C81, 0x0C81},
- {0x0CBC, 0x0CBC},
- {0x0CBF, 0x0CBF},
- {0x0CC6, 0x0CC6},
- {0x0CCC, 0x0CCD},
- {0x0CE2, 0x0CE3},
- {0x0D00, 0x0D01},
- {0x0D3B, 0x0D3C},
- {0x0D41, 0x0D44},
- {0x0D4D, 0x0D4D},
- {0x0D62, 0x0D63},
- {0x0D81, 0x0D81},
- {0x0DCA, 0x0DCA},
- {0x0DD2, 0x0DD6},
- {0x0E31, 0x0E31},
- {0x0E34, 0x0E3A},
- {0x0E47, 0x0E4E},
- {0x0EB1, 0x0EB1},
- {0x0EB4, 0x0EBC},
- {0x0EC8, 0x0ECE},
- {0x0F18, 0x0F19},
- {0x0F35, 0x0F35},
- {0x0F37, 0x0F37},
- {0x0F39, 0x0F39},
- {0x0F71, 0x0F7E},
- {0x0F80, 0x0F84},
- {0x0F86, 0x0F87},
- {0x0F8D, 0x0FBC},
- {0x0FC6, 0x0FC6},
- {0x102D, 0x1030},
- {0x1032, 0x1037},
- {0x1039, 0x103A},
- {0x103D, 0x103E},
- {0x1058, 0x1059},
- {0x105E, 0x1060},
- {0x1071, 0x1074},
- {0x1082, 0x1082},
- {0x1085, 0x1086},
- {0x108D, 0x108D},
- {0x109D, 0x109D},
- {0x135D, 0x135F},
- {0x1712, 0x1714},
- {0x1732, 0x1733},
- {0x1752, 0x1753},
- {0x1772, 0x1773},
- {0x17B4, 0x17B5},
- {0x17B7, 0x17BD},
- {0x17C6, 0x17C6},
- {0x17C9, 0x17D3},
- {0x17DD, 0x17DD},
- {0x180B, 0x180F},
- {0x1885, 0x1886},
- {0x18A9, 0x18A9},
- {0x1920, 0x1922},
- {0x1927, 0x1928},
- {0x1932, 0x1932},
- {0x1939, 0x193B},
- {0x1A17, 0x1A18},
- {0x1A1B, 0x1A1B},
- {0x1A56, 0x1A56},
- {0x1A58, 0x1A60},
- {0x1A62, 0x1A62},
- {0x1A65, 0x1A6C},
- {0x1A73, 0x1A7F},
- {0x1AB0, 0x1B03},
- {0x1B34, 0x1B34},
- {0x1B36, 0x1B3A},
- {0x1B3C, 0x1B3C},
- {0x1B42, 0x1B42},
- {0x1B6B, 0x1B73},
- {0x1B80, 0x1B81},
- {0x1BA2, 0x1BA5},
- {0x1BA8, 0x1BA9},
- {0x1BAB, 0x1BAD},
- {0x1BE6, 0x1BE6},
- {0x1BE8, 0x1BE9},
- {0x1BED, 0x1BED},
- {0x1BEF, 0x1BF1},
- {0x1C2C, 0x1C33},
- {0x1C36, 0x1C37},
- {0x1CD0, 0x1CD2},
- {0x1CD4, 0x1CE0},
- {0x1CE2, 0x1CE8},
- {0x1CED, 0x1CED},
- {0x1CF4, 0x1CF4},
- {0x1CF8, 0x1CF9},
- {0x1DC0, 0x1DFF},
- {0x200B, 0x200F},
- {0x202A, 0x202E},
- {0x2060, 0x206F},
- {0x20D0, 0x20F0},
- {0x2CEF, 0x2CF1},
- {0x2D7F, 0x2D7F},
- {0x2DE0, 0x2DFF},
- {0x302A, 0x302D},
- {0x3099, 0x309A},
- {0xA66F, 0xA672},
- {0xA674, 0xA67D},
- {0xA69E, 0xA69F},
- {0xA6F0, 0xA6F1},
- {0xA802, 0xA802},
- {0xA806, 0xA806},
- {0xA80B, 0xA80B},
- {0xA825, 0xA826},
- {0xA82C, 0xA82C},
- {0xA8C4, 0xA8C5},
- {0xA8E0, 0xA8F1},
- {0xA8FF, 0xA8FF},
- {0xA926, 0xA92D},
- {0xA947, 0xA951},
- {0xA980, 0xA982},
- {0xA9B3, 0xA9B3},
- {0xA9B6, 0xA9B9},
- {0xA9BC, 0xA9BD},
- {0xA9E5, 0xA9E5},
- {0xAA29, 0xAA2E},
- {0xAA31, 0xAA32},
- {0xAA35, 0xAA36},
- {0xAA43, 0xAA43},
- {0xAA4C, 0xAA4C},
- {0xAA7C, 0xAA7C},
- {0xAAB0, 0xAAB0},
- {0xAAB2, 0xAAB4},
- {0xAAB7, 0xAAB8},
- {0xAABE, 0xAABF},
- {0xAAC1, 0xAAC1},
- {0xAAEC, 0xAAED},
- {0xAAF6, 0xAAF6},
- {0xABE5, 0xABE5},
- {0xABE8, 0xABE8},
- {0xABED, 0xABED},
- {0xFB1E, 0xFB1E},
- {0xFE00, 0xFE0F},
- {0xFE20, 0xFE2F},
- {0xFEFF, 0xFEFF},
- {0xFFF9, 0xFFFB},
- {0x101FD, 0x101FD},
- {0x102E0, 0x102E0},
- {0x10376, 0x1037A},
- {0x10A01, 0x10A0F},
- {0x10A38, 0x10A3F},
- {0x10AE5, 0x10AE6},
- {0x10D24, 0x10D27},
- {0x10EAB, 0x10EAC},
- {0x10EFD, 0x10EFF},
- {0x10F46, 0x10F50},
- {0x10F82, 0x10F85},
- {0x11001, 0x11001},
- {0x11038, 0x11046},
- {0x11070, 0x11070},
- {0x11073, 0x11074},
- {0x1107F, 0x11081},
- {0x110B3, 0x110B6},
- {0x110B9, 0x110BA},
- {0x110BD, 0x110BD},
- {0x110C2, 0x110CD},
- {0x11100, 0x11102},
- {0x11127, 0x1112B},
- {0x1112D, 0x11134},
- {0x11173, 0x11173},
- {0x11180, 0x11181},
- {0x111B6, 0x111BE},
- {0x111C9, 0x111CC},
- {0x111CF, 0x111CF},
- {0x1122F, 0x11231},
- {0x11234, 0x11234},
- {0x11236, 0x11237},
- {0x1123E, 0x1123E},
- {0x11241, 0x11241},
- {0x112DF, 0x112DF},
- {0x112E3, 0x112EA},
- {0x11300, 0x11301},
- {0x1133B, 0x1133C},
- {0x11340, 0x11340},
- {0x11366, 0x11374},
- {0x11438, 0x1143F},
- {0x11442, 0x11444},
- {0x11446, 0x11446},
- {0x1145E, 0x1145E},
- {0x114B3, 0x114B8},
- {0x114BA, 0x114BA},
- {0x114BF, 0x114C0},
- {0x114C2, 0x114C3},
- {0x115B2, 0x115B5},
- {0x115BC, 0x115BD},
- {0x115BF, 0x115C0},
- {0x115DC, 0x115DD},
- {0x11633, 0x1163A},
- {0x1163D, 0x1163D},
- {0x1163F, 0x11640},
- {0x116AB, 0x116AB},
- {0x116AD, 0x116AD},
- {0x116B0, 0x116B5},
- {0x116B7, 0x116B7},
- {0x1171D, 0x1171F},
- {0x11722, 0x11725},
- {0x11727, 0x1172B},
- {0x1182F, 0x11837},
- {0x11839, 0x1183A},
- {0x1193B, 0x1193C},
- {0x1193E, 0x1193E},
- {0x11943, 0x11943},
- {0x119D4, 0x119DB},
- {0x119E0, 0x119E0},
- {0x11A01, 0x11A0A},
- {0x11A33, 0x11A38},
- {0x11A3B, 0x11A3E},
- {0x11A47, 0x11A47},
- {0x11A51, 0x11A56},
- {0x11A59, 0x11A5B},
- {0x11A8A, 0x11A96},
- {0x11A98, 0x11A99},
- {0x11C30, 0x11C3D},
- {0x11C3F, 0x11C3F},
- {0x11C92, 0x11CA7},
- {0x11CAA, 0x11CB0},
- {0x11CB2, 0x11CB3},
- {0x11CB5, 0x11CB6},
- {0x11D31, 0x11D45},
- {0x11D47, 0x11D47},
- {0x11D90, 0x11D91},
- {0x11D95, 0x11D95},
- {0x11D97, 0x11D97},
- {0x11EF3, 0x11EF4},
- {0x11F00, 0x11F01},
- {0x11F36, 0x11F3A},
- {0x11F40, 0x11F40},
- {0x11F42, 0x11F42},
- {0x13430, 0x13440},
- {0x13447, 0x13455},
- {0x16AF0, 0x16AF4},
- {0x16B30, 0x16B36},
- {0x16F4F, 0x16F4F},
- {0x16F8F, 0x16F92},
- {0x16FE4, 0x16FE4},
- {0x1BC9D, 0x1BC9E},
- {0x1BCA0, 0x1CF46},
- {0x1D167, 0x1D169},
- {0x1D173, 0x1D182},
- {0x1D185, 0x1D18B},
- {0x1D1AA, 0x1D1AD},
- {0x1D242, 0x1D244},
- {0x1DA00, 0x1DA36},
- {0x1DA3B, 0x1DA6C},
- {0x1DA75, 0x1DA75},
- {0x1DA84, 0x1DA84},
- {0x1DA9B, 0x1DAAF},
- {0x1E000, 0x1E02A},
- {0x1E08F, 0x1E08F},
- {0x1E130, 0x1E136},
- {0x1E2AE, 0x1E2AE},
- {0x1E2EC, 0x1E2EF},
- {0x1E4EC, 0x1E4EF},
- {0x1E8D0, 0x1E8D6},
- {0x1E944, 0x1E94A},
- {0xE0001, 0xE01EF},
- };
-
- static const struct mbinterval east_asian_fw[] = {
- {0x1100, 0x115F},
- {0x231A, 0x231B},
- {0x2329, 0x232A},
- {0x23E9, 0x23EC},
- {0x23F0, 0x23F0},
- {0x23F3, 0x23F3},
- {0x25FD, 0x25FE},
- {0x2614, 0x2615},
- {0x2648, 0x2653},
- {0x267F, 0x267F},
- {0x2693, 0x2693},
- {0x26A1, 0x26A1},
- {0x26AA, 0x26AB},
- {0x26BD, 0x26BE},
- {0x26C4, 0x26C5},
- {0x26CE, 0x26CE},
- {0x26D4, 0x26D4},
- {0x26EA, 0x26EA},
- {0x26F2, 0x26F3},
- {0x26F5, 0x26F5},
- {0x26FA, 0x26FA},
- {0x26FD, 0x26FD},
- {0x2705, 0x2705},
- {0x270A, 0x270B},
- {0x2728, 0x2728},
- {0x274C, 0x274C},
- {0x274E, 0x274E},
- {0x2753, 0x2755},
- {0x2757, 0x2757},
- {0x2795, 0x2797},
- {0x27B0, 0x27B0},
- {0x27BF, 0x27BF},
- {0x2B1B, 0x2B1C},
- {0x2B50, 0x2B50},
- {0x2B55, 0x2B55},
- {0x2E80, 0x2E99},
- {0x2E9B, 0x2EF3},
- {0x2F00, 0x2FD5},
- {0x2FF0, 0x303E},
- {0x3041, 0x3096},
- {0x3099, 0x30FF},
- {0x3105, 0x312F},
- {0x3131, 0x318E},
- {0x3190, 0x31E3},
- {0x31EF, 0x321E},
- {0x3220, 0x3247},
- {0x3250, 0x4DBF},
- {0x4E00, 0xA48C},
- {0xA490, 0xA4C6},
- {0xA960, 0xA97C},
- {0xAC00, 0xD7A3},
- {0xF900, 0xFAFF},
- {0xFE10, 0xFE19},
- {0xFE30, 0xFE52},
- {0xFE54, 0xFE66},
- {0xFE68, 0xFE6B},
- {0xFF01, 0xFF60},
- {0xFFE0, 0xFFE6},
- {0x16FE0, 0x16FE4},
- {0x16FF0, 0x16FF1},
- {0x17000, 0x187F7},
- {0x18800, 0x18CD5},
- {0x18D00, 0x18D08},
- {0x1AFF0, 0x1AFF3},
- {0x1AFF5, 0x1AFFB},
- {0x1AFFD, 0x1AFFE},
- {0x1B000, 0x1B122},
- {0x1B150, 0x1B152},
- {0x1B164, 0x1B167},
- {0x1B170, 0x1B2FB},
- {0x1F004, 0x1F004},
- {0x1F0CF, 0x1F0CF},
- {0x1F18E, 0x1F18E},
- {0x1F191, 0x1F19A},
- {0x1F200, 0x1F202},
- {0x1F210, 0x1F23B},
- {0x1F240, 0x1F248},
- {0x1F250, 0x1F251},
- {0x1F260, 0x1F265},
- {0x1F300, 0x1F320},
- {0x1F32D, 0x1F335},
- {0x1F337, 0x1F37C},
- {0x1F37E, 0x1F393},
- {0x1F3A0, 0x1F3CA},
- {0x1F3CF, 0x1F3D3},
- {0x1F3E0, 0x1F3F0},
- {0x1F3F4, 0x1F3F4},
- {0x1F3F8, 0x1F43E},
- {0x1F440, 0x1F440},
- {0x1F442, 0x1F4FC},
- {0x1F4FF, 0x1F53D},
- {0x1F54B, 0x1F54E},
- {0x1F550, 0x1F567},
- {0x1F57A, 0x1F57A},
- {0x1F595, 0x1F596},
- {0x1F5A4, 0x1F5A4},
- {0x1F5FB, 0x1F64F},
- {0x1F680, 0x1F6C5},
- {0x1F6CC, 0x1F6CC},
- {0x1F6D0, 0x1F6D2},
- {0x1F6D5, 0x1F6D7},
- {0x1F6DD, 0x1F6DF},
- {0x1F6EB, 0x1F6EC},
- {0x1F6F4, 0x1F6FC},
- {0x1F7E0, 0x1F7EB},
- {0x1F7F0, 0x1F7F0},
- {0x1F90C, 0x1F93A},
- {0x1F93C, 0x1F945},
- {0x1F947, 0x1F9FF},
- {0x1FA70, 0x1FA74},
- {0x1FA78, 0x1FA7C},
- {0x1FA80, 0x1FA86},
- {0x1FA90, 0x1FAAC},
- {0x1FAB0, 0x1FABA},
- {0x1FAC0, 0x1FAC5},
- {0x1FAD0, 0x1FAD9},
- {0x1FAE0, 0x1FAE7},
- {0x1FAF0, 0x1FAF6},
- {0x20000, 0x2FFFD},
- {0x30000, 0x3FFFD},
- };
+#include "unicode_nonspacing_table.h"
+#include "unicode_east_asian_fw_table.h"
/* test for 8-bit control characters */
if (ucs == 0)
@@ -2010,6 +1583,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -2059,6 +1637,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -2108,6 +1691,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -2631,6 +2219,19 @@ pg_eucjp_increment(unsigned char *charptr, int length)
/*
+ * Fills the provided buffer with two bytes such that:
+ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+ Assert(pg_encoding_max_length(encoding) > 1);
+
+ dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
+ dst[1] = NONUTF8_INVALID_BYTE1;
+}
+
+/*
*-------------------------------------------------------------------
* encoding info table
*-------------------------------------------------------------------
@@ -2683,10 +2284,27 @@ const pg_wchar_tbl pg_wchar_table[] = {
/*
* Returns the byte length of a multibyte character.
*
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length. Callers that are not prepared to deal with that
- * should use pg_encoding_mblen_bounded() instead.
+ * Choose "mblen" functions based on the input string characteristics.
+ * pg_encoding_mblen() can be used when ANY of these conditions are met:
+ *
+ * - The input string is zero-terminated
+ *
+ * - The input string is known to be valid in the encoding (e.g., string
+ * converted from database encoding)
+ *
+ * - The encoding is not GB18030 (e.g., when only database encodings are
+ * passed to 'encoding' parameter)
+ *
+ * encoding==GB18030 requires examining up to two bytes to determine character
+ * length. Therefore, callers satisfying none of those conditions must use
+ * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
+ * guaranteed to be within allocation bounds.
+ *
+ * When dealing with text that is not certainly valid in the specified
+ * encoding, the result may exceed the actual remaining string length.
+ * Callers that are not prepared to deal with that should use Min(remaining,
+ * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
+ * pg_encoding_mblen_bounded() are interchangeable.
*/
int
pg_encoding_mblen(int encoding, const char *mbstr)
@@ -2697,8 +2315,28 @@ pg_encoding_mblen(int encoding, const char *mbstr)
}
/*
- * Returns the byte length of a multibyte character; but not more than
- * the distance to end of string.
+ * Returns the byte length of a multibyte character (possibly not
+ * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
+ */
+int
+pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+ size_t remaining)
+{
+ /*
+ * Define zero remaining as too few, even for single-byte encodings.
+ * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
+ * zero; others read one.
+ */
+ if (remaining < 1 ||
+ (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
+ return INT_MAX;
+ return pg_encoding_mblen(encoding, mbstr);
+}
+
+/*
+ * Returns the byte length of a multibyte character; but not more than the
+ * distance to the terminating zero byte. For input that might lack a
+ * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
*/
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
@@ -2751,7 +2389,13 @@ pg_encoding_max_length(int encoding)
{
Assert(PG_VALID_ENCODING(encoding));
- return pg_wchar_table[encoding].maxmblen;
+ /*
+ * Check for the encoding despite the assert, due to some mingw versions
+ * otherwise issuing bogus warnings.
+ */
+ return PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].maxmblen :
+ pg_wchar_table[PG_SQL_ASCII].maxmblen;
}
#ifndef FRONTEND