return l;
}
+/*
+ * The fast path of the UTF-8 verifier uses a deterministic finite automaton
+ * (DFA) for multibyte characters. In a traditional table-driven DFA, the
+ * input byte and current state are used to compute an index into an array of
+ * state transitions. Since the address of the next transition is dependent
+ * on this computation, there is latency in executing the load instruction,
+ * and the CPU is not kept busy.
+ *
+ * Instead, we use a "shift-based" DFA as described by Per Vognsen:
+ *
+ * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
+ *
+ * In a shift-based DFA, the input byte is an index into array of integers
+ * whose bit pattern encodes the state transitions. To compute the next
+ * state, we simply right-shift the integer by the current state and apply a
+ * mask. In this scheme, the address of the transition only depends on the
+ * input byte, so there is better pipelining.
+ *
+ * The naming convention for states and transitions was adopted from a UTF-8
+ * to UTF-16/32 transcoder, whose table is reproduced below:
+ *
+ * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
+ *
+ * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
+ * ==========================================================================
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
+ * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
+ * |
+ * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
+ * |
+ * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
+ * |
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
+ * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
+ *
+ * In the most straightforward implementation, a shift-based DFA for UTF-8
+ * requires 64-bit integers to encode the transitions, but with an SMT solver
+ * it's possible to find state numbers such that the transitions fit within
+ * 32-bit integers, as Dougall Johnson demonstrated:
+ *
+ * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
+ *
+ * This packed representation is the reason for the seemingly odd choice of
+ * state values below.
+ */
+
+/* Error */
+#define ERR 0
+/* Begin */
+#define BGN 11
+/* Continuation states, expect 1/2/3 continuation bytes */
+#define CS1 16
+#define CS2 1
+#define CS3 5
+/* Leading byte was E0/ED, expect 1 more continuation byte */
+#define P3A 6
+#define P3B 20
+/* Leading byte was F0/F4, expect 2 more continuation bytes */
+#define P4A 25
+#define P4B 30
+/* Begin and End are the same state */
+#define END BGN
+
+/* the encoded state transitions for the lookup table */
+
+/* ASCII */
+#define ASC (END << BGN)
+/* 2-byte lead */
+#define L2A (CS1 << BGN)
+/* 3-byte lead */
+#define L3A (P3A << BGN)
+#define L3B (CS2 << BGN)
+#define L3C (P3B << BGN)
+/* 4-byte lead */
+#define L4A (P4A << BGN)
+#define L4B (CS3 << BGN)
+#define L4C (P4B << BGN)
+/* continuation byte */
+#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
+#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
+#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
+/* invalid byte */
+#define ILL ERR
+
+static const uint32 Utf8Transition[256] =
+{
+ /* ASCII */
+
+ ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+ /* continuation bytes */
+
+ /* 80..8F */
+ CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
+ CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
+
+ /* 90..9F */
+ CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
+ CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
+
+ /* A0..BF */
+ CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+ CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+ CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+ CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+
+ /* leading bytes */
+
+ /* C0..DF */
+ ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
+ L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+ L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+ L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+ /* E0..EF */
+ L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
+ L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
+
+ /* F0..FF */
+ L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
+ ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
+};
+
+static void
+utf8_advance(const unsigned char *s, uint32 *state, int len)
+{
+ /* Note: We deliberately don't check the state's value here. */
+ while (len > 0)
+ {
+ /*
+ * It's important that the mask value is 31: In most instruction sets,
+ * a shift by a 32-bit operand is understood to be a shift by its mod
+ * 32, so the compiler should elide the mask operation.
+ */
+ *state = Utf8Transition[*s++] >> (*state & 31);
+ len--;
+ }
+
+ *state &= 31;
+}
+
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
const unsigned char *start = s;
+ const int orig_len = len;
+ uint32 state = BGN;
+
+/*
+ * Sixteen seems to give the best balance of performance across different
+ * byte distributions.
+ */
+#define STRIDE_LENGTH 16
+
+ if (len >= STRIDE_LENGTH)
+ {
+ while (len >= STRIDE_LENGTH)
+ {
+ /*
+ * If the chunk is all ASCII, we can skip the full UTF-8 check,
+ * but we must first check for a non-END state, which means the
+ * previous chunk ended in the middle of a multibyte sequence.
+ */
+ if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
+ utf8_advance(s, &state, STRIDE_LENGTH);
+
+ s += STRIDE_LENGTH;
+ len -= STRIDE_LENGTH;
+ }
+
+ /*
+ * The error state persists, so we only need to check for it here. In
+ * case of error we start over from the beginning with the slow path
+ * so we can count the valid bytes.
+ */
+ if (state == ERR)
+ {
+ len = orig_len;
+ s = start;
+ }
+
+ /*
+ * We treat all other states as success, but it's possible the fast
+ * path exited in the middle of a multibyte sequence, since that
+ * wouldn't have caused an error. Before checking the remaining bytes,
+ * walk backwards to find the last byte that could have been the start
+ * of a valid sequence.
+ */
+ while (s > start)
+ {
+ s--;
+ len++;
+
+ if (!IS_HIGHBIT_SET(*s) || pg_utf_mblen(s) > 1)
+ break;
+ }
+ }
+ /* check remaining bytes */
while (len > 0)
{
int l;
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+ ('\x66006f', 'NUL byte'),
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte');
+-- Test UTF-8 verification slow path
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+ description | result | errorat | error
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ NUL byte | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00
+ bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs | \xc280 | |
+ largest 2-byte | \xdfbf | |
+ missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong | \xe0a080 | |
+ last before surrogates | \xed9fbf | |
+ smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates | \xee8080 | |
+ largest 3-byte | \xefbfbf | |
+ missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong | \xf0908080 | |
+ largest 4-byte | \xf48fbfbf | |
+ smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+-- The error message for a sequence starting with a 4-byte lead
+-- will contain all 4 bytes if they are present, so various
+-- expressions below add 3 ASCII bytes to the end to ensure
+-- consistent error messages.
+-- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- Test multibyte verification in fast path
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
+-- Test ASCII verification in fast path where incomplete
+-- UTF-8 sequences fall at the end of the preceding chunk.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
+-- Test cases where UTF-8 sequences within short text
+-- come after the fast path returns.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64)::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
+-- Test cases where incomplete UTF-8 sequences fall at the
+-- end of the part checked by the fast path.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+ ('\x66006f', 'NUL byte'),
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte');
+
+-- Test UTF-8 verification slow path
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+-- The error message for a sequence starting with a 4-byte lead
+-- will contain all 4 bytes if they are present, so various
+-- expressions below add 3 ASCII bytes to the end to ensure
+-- consistent error messages.
+-- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+
+-- Test multibyte verification in fast path
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test ASCII verification in fast path where incomplete
+-- UTF-8 sequences fall at the end of the preceding chunk.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test cases where UTF-8 sequences within short text
+-- come after the fast path returns.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64)::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test cases where incomplete UTF-8 sequences fall at the
+-- end of the part checked by the fast path.
+with test_bytes as (
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),