Add fast path for validating UTF-8 text

author John Naylor <john.naylor@postgresql.org>

Tue, 19 Oct 2021 20:43:14 +0000 (16:43 -0400)

committer John Naylor <john.naylor@postgresql.org>

Mon, 20 Dec 2021 14:07:29 +0000 (10:07 -0400)
author John Naylor <john.naylor@postgresql.org>
Tue, 19 Oct 2021 20:43:14 +0000 (16:43 -0400)
committer John Naylor <john.naylor@postgresql.org>
Mon, 20 Dec 2021 14:07:29 +0000 (10:07 -0400)
diff --git a/src/common/wchar.c b/src/common/wchar.c

index a6bffd06428fbd1152da636daaf63c7631e21f3e..be931c5e92a48afe97aa5c39dd107ee6acc64808 100644 (file)
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1750,11 +1750,226 @@ pg_utf8_verifychar(const unsigned char *s, int len)
     return l;
  }
  
+/*
+ * The fast path of the UTF-8 verifier uses a deterministic finite automaton
+ * (DFA) for multibyte characters. In a traditional table-driven DFA, the
+ * input byte and current state are used to compute an index into an array of
+ * state transitions. Since the address of the next transition is dependent
+ * on this computation, there is latency in executing the load instruction,
+ * and the CPU is not kept busy.
+ *
+ * Instead, we use a "shift-based" DFA as described by Per Vognsen:
+ *
+ * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
+ *
+ * In a shift-based DFA, the input byte is an index into an array of integers
+ * whose bit pattern encodes the state transitions. To compute the next
+ * state, we simply right-shift the integer by the current state and apply a
+ * mask. In this scheme, the address of the transition only depends on the
+ * input byte, so there is better pipelining.
+ *
+ * The naming convention for states and transitions was adopted from a UTF-8
+ * to UTF-16/32 transcoder, whose table is reproduced below:
+ *
+ * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
+ *
+ * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
+ * ==========================================================================
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
+ * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
+ *                                                                  |
+ * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
+ *                                                                  |
+ * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
+ *                                                                  |
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
+ * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
+ *
+ * In the most straightforward implementation, a shift-based DFA for UTF-8
+ * requires 64-bit integers to encode the transitions, but with an SMT solver
+ * it's possible to find state numbers such that the transitions fit within
+ * 32-bit integers, as Dougall Johnson demonstrated:
+ *
+ * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
+ *
+ * This packed representation is the reason for the seemingly odd choice of
+ * state values below.
+ */
+
+/* Error; being zero, it makes the table entry of an invalid byte (ILL) 0 */
+#define    ERR  0
+/* Begin */
+#define    BGN 11
+/* Continuation states, expect 1/2/3 continuation bytes */
+#define    CS1 16
+#define    CS2  1
+#define    CS3  5
+/* Lead was E0/ED: range-restricted continuation byte, then 1 more */
+#define    P3A  6     /* lead E0: next byte must be A0..BF */
+#define    P3B 20     /* lead ED: next byte must be 80..9F */
+/* Lead was F0/F4: range-restricted continuation byte, then 2 more */
+#define    P4A 25     /* lead F0: next byte must be 90..BF */
+#define    P4B 30     /* lead F4: next byte must be 80..8F */
+/* Begin and End are the same state */
+#define    END BGN
+
+/*
+ * the encoded state transitions for the lookup table
+ *
+ * A term (NEXT << CUR) stores state NEXT at bit offset CUR of the entry;
+ * utf8_advance() reads it back by shifting the entry right by the current
+ * state and masking with 31. The five-bit fields may share bits: in CR1,
+ * for example, (CS1 << CS2) and (CS2 << CS3) both set bit 5. A current
+ * state that has no term in an entry reads back as 0, which is ERR.
+ */
+
+/* ASCII */
+#define ASC (END << BGN)
+/* 2-byte lead */
+#define L2A (CS1 << BGN)
+/* 3-byte lead */
+#define L3A (P3A << BGN)
+#define L3B (CS2 << BGN)
+#define L3C (P3B << BGN)
+/* 4-byte lead */
+#define L4A (P4A << BGN)
+#define L4B (CS3 << BGN)
+#define L4C (P4B << BGN)
+/*
+ * continuation byte
+ *
+ * CR1/CR2/CR3 differ only in which partial states (P3A/P3B/P4A/P4B) accept
+ * them as the first continuation byte.
+ *
+ * NOTE(review): these expansions have no outer parentheses, so they are
+ * only safe where the whole macro forms one expression, as it does in the
+ * Utf8Transition[] initializer.
+ */
+#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
+#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
+#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
+/* invalid byte */
+#define ILL ERR
+
+static const uint32 Utf8Transition[256] =
+{
+   /* ASCII (one entry per byte value; 00 is ILL, so a NUL byte is an error) */
+
+   ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
+
+   /* continuation bytes; the three classes differ after E0/ED/F0/F4 leads */
+
+   /* 80..8F; may directly follow a lead of ED or F4, but not E0 or F0 */
+   CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
+   CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
+
+   /* 90..9F; may directly follow a lead of ED or F0, but not E0 or F4 */
+   CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
+   CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
+
+   /* A0..BF; may directly follow a lead of E0 or F0, but not ED or F4 */
+   CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+   CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+   CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+   CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
+
+   /* leading bytes */
+
+   /* C0..DF; C0 and C1 are ILL, they could only begin 2-byte overlongs */
+   ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
+   L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+   L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+   L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+   /* E0..EF; E0 is L3A and ED is L3C, every other lead is L3B */
+   L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
+   L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
+
+   /* F0..FF; F0 is L4A, F1..F3 are L4B, F4 is L4C, F5..FF are ILL */
+   L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
+   ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
+};
+
+static void
+utf8_advance(const unsigned char *s, uint32 *state, int len)
+{
+   /*
+    * Feed len bytes starting at s through the DFA, updating *state in
+    * place. *state is both input (the state to resume from) and output.
+    *
+    * Note: We deliberately don't check the state's value here. ERR is 0
+    * and no entry of Utf8Transition[] has any of its low five bits set, so
+    * once the state is ERR it stays ERR for the rest of the input; the
+    * caller can test for it afterwards.
+    */
+   while (len > 0)
+   {
+       /*
+        * It's important that the mask value is 31: In most instruction sets,
+        * a shift by a 32-bit operand is understood to be a shift by its mod
+        * 32, so the compiler should elide the mask operation.
+        *
+        * Only the low five bits of the result are the new state; the bits
+        * above them are leftovers from the shifted table entry.
+        */
+       *state = Utf8Transition[*s++] >> (*state & 31);
+       len--;
+   }
+
+   *state &= 31;       /* discard the leftover bits so callers can compare */
+}
+
  static int
  pg_utf8_verifystr(const unsigned char *s, int len)
  {
     const unsigned char *start = s;
+   const int   orig_len = len;
+   uint32      state = BGN;
+
+/*
+ * Sixteen seems to give the best balance of performance across different
+ * byte distributions.
+ */
+#define STRIDE_LENGTH 16
+
+   if (len >= STRIDE_LENGTH)
+   {
+       while (len >= STRIDE_LENGTH)
+       {
+           /*
+            * If the chunk is all ASCII, we can skip the full UTF-8 check,
+            * but we must first check for a non-END state, which means the
+            * previous chunk ended in the middle of a multibyte sequence.
+            */
+           if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
+               utf8_advance(s, &state, STRIDE_LENGTH);
+
+           s += STRIDE_LENGTH;
+           len -= STRIDE_LENGTH;
+       }
+
+       /*
+        * The error state persists, so we only need to check for it here. In
+        * case of error we start over from the beginning with the slow path
+        * so we can count the valid bytes.
+        */
+       if (state == ERR)
+       {
+           len = orig_len;
+           s = start;
+       }
+
+       /*
+        * We treat all other states as success, but it's possible the fast
+        * path exited in the middle of a multibyte sequence, since that
+        * wouldn't have caused an error. Before checking the remaining bytes,
+        * walk backwards to find the last byte that could have been the start
+        * of a valid sequence.
+        */
+       while (s > start)
+       {
+           s--;
+           len++;
+
+           if (!IS_HIGHBIT_SET(*s) || pg_utf_mblen(s) > 1)
+               break;
+       }
+   }
  
+   /* check remaining bytes */
     while (len > 0)
     {
         int         l;
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index d93ccac263338c751347c3c5a25e56ef34127711..6bd996b3d0c4c3f05e1a5862cc94d244d1d168eb 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -699,4 +699,57 @@ extern int mic2latin_with_table(const unsigned char *mic, unsigned char *p,
  extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  #endif
  
+
+/*
+ * Verify a chunk of bytes for valid ASCII.
+ *
+ * s points to the bytes to check and len is how many of them to check. The
+ * bytes are processed eight at a time.
+ *
+ * Returns false if the input contains any zero bytes or bytes with the
+ * high-bit set. Input len must be a multiple of 8; this is enforced only by
+ * an Assert, and with any other len the last eight-byte read would extend
+ * past s + len. A len of zero returns true.
+ */
+static inline bool
+is_valid_ascii(const unsigned char *s, int len)
+{
+   uint64      chunk,
+               highbit_cum = UINT64CONST(0),
+               zero_cum = UINT64CONST(0x8080808080808080);
+
+   Assert(len % sizeof(chunk) == 0);
+
+   while (len > 0)
+   {
+       memcpy(&chunk, s, sizeof(chunk));   /* memcpy needs no alignment of s */
+
+       /*
+        * Capture any zero bytes in this chunk.
+        *
+        * First, add 0x7f to each byte. This sets the high bit in each byte,
+        * unless it was a zero. If any resulting high bits are zero, the
+        * corresponding high bits in the zero accumulator will be cleared.
+        *
+        * If none of the bytes in the chunk had the high bit set, the max
+        * value each byte can have after the addition is 0x7f + 0x7f = 0xfe,
+        * and we don't need to worry about carrying over to the next byte. If
+        * any input bytes did have the high bit set, it doesn't matter
+        * because we check for those separately.
+        */
+       zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+
+       /*
+        * Capture any set bits in this chunk. Only the high bit of each byte
+        * is examined once the loop is done.
+        */
+       highbit_cum |= chunk;
+
+       s += sizeof(chunk);
+       len -= sizeof(chunk);
+   }
+
+   /* Check if any high bits in the high bit accumulator got set. */
+   if (highbit_cum & UINT64CONST(0x8080808080808080))
+       return false;
+
+   /* Check if any high bits in the zero accumulator got cleared. */
+   if (zero_cum != UINT64CONST(0x8080808080808080))
+       return false;
+
+   return true;
+}
+
  #endif                         /* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out

index 5c9d63175562c67e36847b5539fecf38d6adf9d4..f8a64f616e65208c6b7f2ec9498d07f7d19e26ed 100644 (file)
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,175 @@ $$;
  --
  -- UTF-8
  --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\x66006f', 'NUL byte'),
+  ('\xaf',     'bare continuation'),
+  ('\xc5',     'missing second byte in 2-byte char'),
+  ('\xc080',   'smallest 2-byte overlong'),
+  ('\xc1bf',   'largest 2-byte overlong'),
+  ('\xc280',   'next 2-byte after overlongs'),
+  ('\xdfbf',   'largest 2-byte'),
+  ('\xe9af',   'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080',   'smallest 4-byte overlong'),
+  ('\xf08fbfbf',   'largest 4-byte overlong'),
+  ('\xf0908080',   'next 4-byte after overlong'),
+  ('\xf48fbfbf',   'largest 4-byte'),
+  ('\xf4908080',   'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte');
+-- Test UTF-8 verification slow path
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ NUL byte                           | \x66       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+-- The error message for a sequence starting with a 4-byte lead
+-- will contain all 4 bytes if they are present, so various
+-- expressions below add 3 ASCII bytes to the end to ensure
+-- consistent error messages.
+-- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- Test multibyte verification in fast path
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
+-- Test ASCII verification in fast path where incomplete
+-- UTF-8 sequences fall at the end of the preceding chunk.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
+-- Test cases where UTF-8 sequences within short text
+-- come after the fast path returns.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64)::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
+-- Test cases where incomplete UTF-8 sequences fall at the
+-- end of the part checked by the fast path.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
  CREATE TABLE utf8_inputs (inbytes bytea, description text);
  insert into utf8_inputs  values
    ('\x666f6f',     'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql

index 5576999e42ebd470a5a5f6f8ca1dc31ba597474a..e178e2479b0e54d4700f880b269765111fef5cc5 100644 (file)
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,139 @@ $$;
  --
  -- UTF-8
  --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\x66006f', 'NUL byte'),
+  ('\xaf',     'bare continuation'),
+  ('\xc5',     'missing second byte in 2-byte char'),
+  ('\xc080',   'smallest 2-byte overlong'),
+  ('\xc1bf',   'largest 2-byte overlong'),
+  ('\xc280',   'next 2-byte after overlongs'),
+  ('\xdfbf',   'largest 2-byte'),
+  ('\xe9af',   'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080',   'smallest 4-byte overlong'),
+  ('\xf08fbfbf',   'largest 4-byte overlong'),
+  ('\xf0908080',   'next 4-byte after overlong'),
+  ('\xf48fbfbf',   'largest 4-byte'),
+  ('\xf4908080',   'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte');
+
+-- Test UTF-8 verification slow path
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+-- The error message for a sequence starting with a 4-byte lead
+-- will contain all 4 bytes if they are present, so various
+-- expressions below add 3 ASCII bytes to the end to ensure
+-- consistent error messages.
+-- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+
+-- Test multibyte verification in fast path
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test ASCII verification in fast path where incomplete
+-- UTF-8 sequences fall at the end of the preceding chunk.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 64)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test cases where UTF-8 sequences within short text
+-- come after the fast path returns.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64)::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
+-- Test cases where incomplete UTF-8 sequences fall at the
+-- end of the part checked by the fast path.
+with test_bytes as (
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(repeat('.', 64 - length(inbytes))::bytea || inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
  CREATE TABLE utf8_inputs (inbytes bytea, description text);
  insert into utf8_inputs  values
    ('\x666f6f',     'valid, pure ASCII'),
author	John Naylor <john.naylor@postgresql.org>
	Tue, 19 Oct 2021 20:43:14 +0000 (16:43 -0400)
committer	John Naylor <john.naylor@postgresql.org>
	Mon, 20 Dec 2021 14:07:29 +0000 (10:07 -0400)
src/common/wchar.c		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history
src/test/regress/expected/conversion.out		patch \| blob \| blame \| history
src/test/regress/sql/conversion.sql		patch \| blob \| blame \| history