Add pg_encoding_set_invalid()

author Andres Freund <andres@anarazel.de>

Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)

committer Andres Freund <andres@anarazel.de>

Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)
author Andres Freund <andres@anarazel.de>
Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)
committer Andres Freund <andres@anarazel.de>
Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)
diff --git a/src/common/wchar.c b/src/common/wchar.c

index 0636b8765ba358a382fccabab376adabe75e44c9..35885fb6de7d91650ecefb91f5bccce2c3b6d255 100644 (file)
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,25 @@
  #include "mb/pg_wchar.h"
  
  
+/*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically.  Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair.  No core conversion
+ * could translate it.  However, longstanding verifychar implementations
+ * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation.  To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)   /* first byte of the invalid pair */
+#define NONUTF8_INVALID_BYTE1 (' ')    /* second byte of the invalid pair */
+
+
  /*
   * Operations on multi-byte encodings are driven by a table of helper
   * functions.
@@ -1532,6 +1551,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
     if (len < l)
         return -1;
  
+   if (l == 2 &&
+       s[0] == NONUTF8_INVALID_BYTE0 &&
+       s[1] == NONUTF8_INVALID_BYTE1)
+       return -1;
+
     while (--l > 0)
     {
         if (*++s == '\0')
@@ -1581,6 +1605,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
     if (len < l)
         return -1;
  
+   if (l == 2 &&
+       s[0] == NONUTF8_INVALID_BYTE0 &&
+       s[1] == NONUTF8_INVALID_BYTE1)
+       return -1;
+
     while (--l > 0)
     {
         if (*++s == '\0')
@@ -1630,6 +1659,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
     if (len < l)
         return -1;
  
+   if (l == 2 &&
+       s[0] == NONUTF8_INVALID_BYTE0 &&
+       s[1] == NONUTF8_INVALID_BYTE1)
+       return -1;
+
     while (--l > 0)
     {
         if (*++s == '\0')
@@ -1858,6 +1892,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
  }
  
  
+/*
+ * Fills dst[0] and dst[1] (nothing else; no NUL terminator is written) so that:
+ *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+   Assert(pg_encoding_max_length(encoding) > 1);   /* multibyte encodings only */
+
+   dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
+   dst[1] = NONUTF8_INVALID_BYTE1;
+}
+
  /*
   *-------------------------------------------------------------------
   * encoding info table
@@ -1980,5 +2027,11 @@ pg_encoding_max_length(int encoding)
  {
     Assert(PG_VALID_ENCODING(encoding));
  
-   return pg_wchar_table[encoding].maxmblen;
+   /*
+    * Check for the encoding despite the assert, due to some mingw versions
+    * otherwise issuing bogus warnings.
+    */
+   return PG_VALID_ENCODING(encoding) ?
+       pg_wchar_table[encoding].maxmblen :
+       pg_wchar_table[PG_SQL_ASCII].maxmblen;
  }
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index d93ccac263338c751347c3c5a25e56ef34127711..abd65eb9f7dbca2c26909e390bc5a05b21091999 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -359,7 +359,7 @@ typedef struct pg_enc2name
  #endif
  } pg_enc2name;
  
-extern const pg_enc2name pg_enc2name_tbl[];
+extern PGDLLIMPORT const pg_enc2name pg_enc2name_tbl[];
  
  /*
   * Encoding names for gettext
@@ -573,6 +573,7 @@ extern int  pg_valid_server_encoding_id(int encoding);
   * (in addition to the ones just above).  The constant tables declared
   * earlier in this file are also available from libpgcommon.
   */
+extern void pg_encoding_set_invalid(int encoding, char *dst);
  extern int pg_encoding_mblen(int encoding, const char *mbstr);
  extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
  extern int pg_encoding_dsplen(int encoding, const char *mbstr);
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out

index 04fdcba4964a5669721a65393d273e7ba7aac730..772814732af0850a352aadf845fd495472884a51 100644 (file)
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -1,6 +1,10 @@
  --
  -- create user defined conversion
  --
+SELECT FROM test_enc_setup();
+--
+(1 row)
+
  CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
  SET SESSION AUTHORIZATION regress_conversion_user;
  CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
diff --git a/src/test/regress/input/create_function_0.source b/src/test/regress/input/create_function_0.source

index f47f635789ab2a30583bbb38afa23dffa4e1bcad..54c76f9a8ed11481f8ebc779c31aefc1bbcc4012 100644 (file)
--- a/src/test/regress/input/create_function_0.source
+++ b/src/test/regress/input/create_function_0.source
@@ -59,6 +59,11 @@ CREATE FUNCTION test_opclass_options_func(internal)
      AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func'
      LANGUAGE C;
  
+
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS '@libdir@/regress@DLSUFFIX@', 'test_enc_setup'
+    LANGUAGE C STRICT;
+
  CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
      AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion'
      LANGUAGE C STRICT;
diff --git a/src/test/regress/output/create_function_0.source b/src/test/regress/output/create_function_0.source

index 342bc40e115ff55778096bc62f5c85c9b9a84b13..7d3908967ac61a793ac6f862393f2eda332d7bb3 100644 (file)
--- a/src/test/regress/output/create_function_0.source
+++ b/src/test/regress/output/create_function_0.source
@@ -46,6 +46,9 @@ CREATE FUNCTION test_opclass_options_func(internal)
      RETURNS void
      AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func'
      LANGUAGE C;
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS '@libdir@/regress@DLSUFFIX@', 'test_enc_setup'
+    LANGUAGE C STRICT;
  CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
      AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion'
      LANGUAGE C STRICT;
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c

index 38d67aa79c9131acb92d9944c0929c55de7fc247..0a0153e55511a2fe36015f9a00322089971baa59 100644 (file)
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1089,6 +1089,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS)
     PG_RETURN_NULL();
  }
  
+/* one-time tests for encoding infrastructure; failures are reported as WARNINGs */
+PG_FUNCTION_INFO_V1(test_enc_setup);
+Datum
+test_enc_setup(PG_FUNCTION_ARGS)
+{
+   /* Test pg_encoding_set_invalid() for each encoding with max length != 1 */
+   for (int i = 0; i < _PG_LAST_ENCODING_; i++)
+   {
+       char        buf[2],
+                   bigbuf[16];
+       int         len,
+                   mblen,
+                   valid;
+
+       if (pg_encoding_max_length(i) == 1)
+           continue;           /* max length 1: single-byte, skip */
+       pg_encoding_set_invalid(i, buf);
+       len = strnlen(buf, 2);  /* bounded: buf holds only the two bytes */
+       if (len != 2)
+           elog(WARNING,
+                "official invalid string for encoding \"%s\" has length %d",
+                pg_enc2name_tbl[i].name, len);
+       mblen = pg_encoding_mblen(i, buf);
+       if (mblen != 2)
+           elog(WARNING,
+                "official invalid string for encoding \"%s\" has mblen %d",
+                pg_enc2name_tbl[i].name, mblen);
+       valid = pg_encoding_verifymbstr(i, buf, len);
+       if (valid != 0)
+           elog(WARNING,
+                "official invalid string for encoding \"%s\" has valid prefix of length %d",
+                pg_enc2name_tbl[i].name, valid);
+       valid = pg_encoding_verifymbstr(i, buf, 1);    /* first byte alone */
+       if (valid != 0)
+           elog(WARNING,
+                "first byte of official invalid string for encoding \"%s\" has valid prefix of length %d",
+                pg_enc2name_tbl[i].name, valid);
+       memset(bigbuf, ' ', sizeof(bigbuf));   /* the pair followed by spaces */
+       bigbuf[0] = buf[0];
+       bigbuf[1] = buf[1];
+       valid = pg_encoding_verifymbstr(i, bigbuf, sizeof(bigbuf));
+       if (valid != 0)
+           elog(WARNING,
+                "trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d",
+                pg_enc2name_tbl[i].name, valid);
+   }
+
+   PG_RETURN_VOID();
+}
+
  /*
   * Call an encoding conversion or verification function.
   *
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql

index 835868243219774e1a3e23ce2368a7f2f32710ca..d22b065885fa4695531fac815b821dff17a86e42 100644 (file)
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -1,6 +1,9 @@
  --
  -- create user defined conversion
  --
+
+SELECT FROM test_enc_setup();
+
  CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
  SET SESSION AUTHORIZATION regress_conversion_user;
  CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
author	Andres Freund <andres@anarazel.de>
	Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)
committer	Andres Freund <andres@anarazel.de>
	Mon, 10 Feb 2025 15:03:40 +0000 (10:03 -0500)
src/common/wchar.c		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history
src/test/regress/expected/conversion.out		patch \| blob \| blame \| history
src/test/regress/input/create_function_0.source		patch \| blob \| blame \| history
src/test/regress/output/create_function_0.source		patch \| blob \| blame \| history
src/test/regress/regress.c		patch \| blob \| blame \| history
src/test/regress/sql/conversion.sql		patch \| blob \| blame \| history