Rationalize code placement between wchar.c, encnames.c, and mbutils.c.

author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 0eb915e62e9fb38bcea00c416f15ace2fabf252f..25fb7e2ebfa120c735b28bf14d01eeb310e96c30 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1555,9 +1555,14 @@ init_icu_converter(void)
         UConverter *conv;
  
         if (icu_converter)
-               return;
+               return;                                 /* already done */
  
         icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
+       if (!icu_encoding_name)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("encoding \"%s\" not supported by ICU",
+                                               pg_encoding_to_char(GetDatabaseEncoding()))));
  
         status = U_ZERO_ERROR;
         conv = ucnv_open(icu_encoding_name, &status);
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index 11f17b337e6511bbff8d9e363f5f34d3a8570afd..54dcf71fb75625163fccb7d29c793999ebfc9b74 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -115,7 +115,7 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
                 }
                 else
                 {
-                       int                     l = pg_mic_mblen(mic);
+                       int                     l = pg_mule_mblen(mic);
  
                         if (len < l)
                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
@@ -217,7 +217,7 @@ mic2latin_with_table(const unsigned char *mic,
                 }
                 else
                 {
-                       int                     l = pg_mic_mblen(mic);
+                       int                     l = pg_mule_mblen(mic);
  
                         if (len < l)
                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 5d7cc74ad6c869e8011354d50fa7ba92a462ac62..86787bcb3190a0048bf7cff7eccaf6c3f1053732 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -1066,6 +1066,23 @@ pg_client_encoding(PG_FUNCTION_ARGS)
         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
  }
  
+Datum
+PG_char_to_encoding(PG_FUNCTION_ARGS)
+{
+       Name            s = PG_GETARG_NAME(0);
+
+       PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
+}
+
+Datum
+PG_encoding_to_char(PG_FUNCTION_ARGS)
+{
+       int32           encoding = PG_GETARG_INT32(0);
+       const char *encoding_name = pg_encoding_to_char(encoding);
+
+       return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
+}
+
  /*
   * gettext() returns messages in this encoding.  This often matches the
   * database encoding, but it differs for SQL_ASCII databases, for processes
@@ -1078,6 +1095,438 @@ GetMessageEncoding(void)
         return MessageEncoding->encoding;
  }
  
+
+/*
+ * Generic character incrementer function.
+ *
+ * Not knowing anything about the properties of the encoding in use, we just
+ * keep incrementing the last byte until we get a validly-encoded result,
+ * or we run out of values to try.  We don't bother to try incrementing
+ * higher-order bytes, so there's no growth in runtime for wider characters.
+ * (If we did try to do that, we'd need to consider the likelihood that 255
+ * is not a valid final byte in the encoding.)
+ */
+static bool
+pg_generic_charinc(unsigned char *charptr, int len)
+{
+       unsigned char *lastbyte = charptr + len - 1;
+       mbverifier      mbverify;
+
+       /* We can just invoke the character verifier directly. */
+       mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+
+       while (*lastbyte < (unsigned char) 255)
+       {
+               (*lastbyte)++;
+               if ((*mbverify) (charptr, len) == len)
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * UTF-8 character incrementer function.
+ *
+ * For a one-byte character less than 0x7F, we just increment the byte.
+ *
+ * For a multibyte character, every byte but the first must fall between 0x80
+ * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
+ * the last byte that's not already at its maximum value.  If we can't find a
+ * byte that's less than the maximum allowable value, we simply fail.  We also
+ * need some special-case logic to skip regions used for surrogate pair
+ * handling, as those should not occur in valid UTF-8.
+ *
+ * Note that we don't reset lower-order bytes back to their minimums, since
+ * we can't afford to make an exhaustive search (see make_greater_string).
+ */
+static bool
+pg_utf8_increment(unsigned char *charptr, int length)
+{
+       unsigned char a;
+       unsigned char limit;
+
+       switch (length)
+       {
+               default:
+                       /* reject lengths 5 and 6 for now */
+                       return false;
+               case 4:
+                       a = charptr[3];
+                       if (a < 0xBF)
+                       {
+                               charptr[3]++;
+                               break;
+                       }
+                       /* FALL THRU */
+               case 3:
+                       a = charptr[2];
+                       if (a < 0xBF)
+                       {
+                               charptr[2]++;
+                               break;
+                       }
+                       /* FALL THRU */
+               case 2:
+                       a = charptr[1];
+                       switch (*charptr)
+                       {
+                               case 0xED:
+                                       limit = 0x9F;
+                                       break;
+                               case 0xF4:
+                                       limit = 0x8F;
+                                       break;
+                               default:
+                                       limit = 0xBF;
+                                       break;
+                       }
+                       if (a < limit)
+                       {
+                               charptr[1]++;
+                               break;
+                       }
+                       /* FALL THRU */
+               case 1:
+                       a = *charptr;
+                       if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
+                               return false;
+                       charptr[0]++;
+                       break;
+       }
+
+       return true;
+}
+
+/*
+ * EUC-JP character incrementer function.
+ *
+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
+ * representing JIS X 0201 characters with the second byte ranging between
+ * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.
+ *
+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
+ * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
+ * is incremented if possible, otherwise the second-to-last byte.
+ *
+ * If the sequence starts with a value other than the above and its MSB
+ * is set, it must be a two-byte sequence representing JIS X 0208 characters
+ * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
+ * incremented if possible, otherwise the second-to-last byte.
+ *
+ * Otherwise, the sequence is a single-byte ASCII character. It is
+ * incremented up to 0x7f.
+ */
+static bool
+pg_eucjp_increment(unsigned char *charptr, int length)
+{
+       unsigned char c1,
+                               c2;
+       int                     i;
+
+       c1 = *charptr;
+
+       switch (c1)
+       {
+               case SS2:                               /* JIS X 0201 */
+                       if (length != 2)
+                               return false;
+
+                       c2 = charptr[1];
+
+                       if (c2 >= 0xdf)
+                               charptr[0] = charptr[1] = 0xa1;
+                       else if (c2 < 0xa1)
+                               charptr[1] = 0xa1;
+                       else
+                               charptr[1]++;
+                       break;
+
+               case SS3:                               /* JIS X 0212 */
+                       if (length != 3)
+                               return false;
+
+                       for (i = 2; i > 0; i--)
+                       {
+                               c2 = charptr[i];
+                               if (c2 < 0xa1)
+                               {
+                                       charptr[i] = 0xa1;
+                                       return true;
+                               }
+                               else if (c2 < 0xfe)
+                               {
+                                       charptr[i]++;
+                                       return true;
+                               }
+                       }
+
+                       /* Out of 3-byte code region */
+                       return false;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
+                       {
+                               if (length != 2)
+                                       return false;
+
+                               for (i = 1; i >= 0; i--)
+                               {
+                                       c2 = charptr[i];
+                                       if (c2 < 0xa1)
+                                       {
+                                               charptr[i] = 0xa1;
+                                               return true;
+                                       }
+                                       else if (c2 < 0xfe)
+                                       {
+                                               charptr[i]++;
+                                               return true;
+                                       }
+                               }
+
+                               /* Out of 2 byte code region */
+                               return false;
+                       }
+                       else
+                       {                                       /* ASCII, single byte */
+                               if (c1 > 0x7e)
+                                       return false;
+                               (*charptr)++;
+                       }
+                       break;
+       }
+
+       return true;
+}
+
+/*
+ * get the character incrementer for the encoding for the current database
+ */
+mbcharacter_incrementer
+pg_database_encoding_character_incrementer(void)
+{
+       /*
+        * Eventually it might be best to add a field to pg_wchar_table[], but for
+        * now we just use a switch.
+        */
+       switch (GetDatabaseEncoding())
+       {
+               case PG_UTF8:
+                       return pg_utf8_increment;
+
+               case PG_EUC_JP:
+                       return pg_eucjp_increment;
+
+               default:
+                       return pg_generic_charinc;
+       }
+}
+
+/*
+ * fetch maximum length of the encoding for the current database
+ */
+int
+pg_database_encoding_max_length(void)
+{
+       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding.  Otherwise same as pg_verify_mbstr().
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+       return
+               pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+       return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
+ * specified by len.
+ *
+ * If OK, return length of string in the encoding.
+ * If a problem is found, return -1 when noError is
+ * true; when noError is false, ereport() a descriptive message.
+ */
+int
+pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
+{
+       mbverifier      mbverify;
+       int                     mb_len;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /*
+        * In single-byte encodings, we need only reject nulls (\0).
+        */
+       if (pg_encoding_max_length(encoding) <= 1)
+       {
+               const char *nullpos = memchr(mbstr, 0, len);
+
+               if (nullpos == NULL)
+                       return len;
+               if (noError)
+                       return -1;
+               report_invalid_encoding(encoding, nullpos, 1);
+       }
+
+       /* fetch function pointer just once */
+       mbverify = pg_wchar_table[encoding].mbverify;
+
+       mb_len = 0;
+
+       while (len > 0)
+       {
+               int                     l;
+
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(*mbstr))
+               {
+                       if (*mbstr != '\0')
+                       {
+                               mb_len++;
+                               mbstr++;
+                               len--;
+                               continue;
+                       }
+                       if (noError)
+                               return -1;
+                       report_invalid_encoding(encoding, mbstr, len);
+               }
+
+               l = (*mbverify) ((const unsigned char *) mbstr, len);
+
+               if (l < 0)
+               {
+                       if (noError)
+                               return -1;
+                       report_invalid_encoding(encoding, mbstr, len);
+               }
+
+               mbstr += l;
+               len -= l;
+               mb_len++;
+       }
+       return mb_len;
+}
+
+/*
+ * check_encoding_conversion_args: check arguments of a conversion function
+ *
+ * "expected" arguments can be either an encoding ID or -1 to indicate that
+ * the caller will check whether it accepts the ID.
+ *
+ * Note: the errors here are not really user-facing, so elog instead of
+ * ereport seems sufficient.  Also, we trust that the "expected" encoding
+ * arguments are valid encoding IDs, but we don't trust the actuals.
+ */
+void
+check_encoding_conversion_args(int src_encoding,
+                                                          int dest_encoding,
+                                                          int len,
+                                                          int expected_src_encoding,
+                                                          int expected_dest_encoding)
+{
+       if (!PG_VALID_ENCODING(src_encoding))
+               elog(ERROR, "invalid source encoding ID: %d", src_encoding);
+       if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
+               elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
+                        pg_enc2name_tbl[expected_src_encoding].name,
+                        pg_enc2name_tbl[src_encoding].name);
+       if (!PG_VALID_ENCODING(dest_encoding))
+               elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
+       if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
+               elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
+                        pg_enc2name_tbl[expected_dest_encoding].name,
+                        pg_enc2name_tbl[dest_encoding].name);
+       if (len < 0)
+               elog(ERROR, "encoding conversion length must not be negative");
+}
+
+/*
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
+{
+       int                     l = pg_encoding_mblen(encoding, mbstr);
+       char            buf[8 * 5 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)
+       {
+               p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+               if (j < jlimit - 1)
+                       p += sprintf(p, " ");
+       }
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("invalid byte sequence for encoding \"%s\": %s",
+                                       pg_enc2name_tbl[encoding].name,
+                                       buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                  const char *mbstr, int len)
+{
+       int                     l = pg_encoding_mblen(src_encoding, mbstr);
+       char            buf[8 * 5 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)
+       {
+               p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+               if (j < jlimit - 1)
+                       p += sprintf(p, " ");
+       }
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                        errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
+                                       buf,
+                                       pg_enc2name_tbl[src_encoding].name,
+                                       pg_enc2name_tbl[dest_encoding].name)));
+}
+
+
  #ifdef WIN32
  /*
   * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
@@ -1149,4 +1598,4 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
         return utf16;
  }
  
-#endif
+#endif                                                 /* WIN32 */
diff --git a/src/common/encnames.c b/src/common/encnames.c
index 2086e007fc547b9be20ace86c76f0dc432a2ab0b..14cf1b39e9866fb2190b28b9b9a7fc071634ea25 100644
--- a/src/common/encnames.c
+++ b/src/common/encnames.c
@@ -10,12 +10,7 @@
   *
   *-------------------------------------------------------------------------
   */
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#include "utils/builtins.h"
-#endif
+#include "c.h"
  
  #include <ctype.h>
  #include <unistd.h>
@@ -310,6 +305,7 @@ static const pg_encname pg_encname_tbl[] =
  #else
  #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
  #endif
+
  const pg_enc2name pg_enc2name_tbl[] =
  {
         DEF_ENC2NAME(SQL_ASCII, 0),
@@ -409,10 +405,8 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
  };
  
  
-#ifndef FRONTEND
-
  /*
- * Table of encoding names for ICU
+ * Table of encoding names for ICU (currently covers backend encodings only)
   *
   * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
   *
@@ -457,33 +451,32 @@ static const char *const pg_enc2icu_tbl[] =
         "KOI8-U",                                       /* PG_KOI8U */
  };
  
+
+/*
+ * Is this encoding supported by ICU?
+ */
  bool
  is_encoding_supported_by_icu(int encoding)
  {
+       if (!PG_VALID_BE_ENCODING(encoding))
+               return false;
         return (pg_enc2icu_tbl[encoding] != NULL);
  }
  
+/*
+ * Returns ICU's name for encoding, or NULL if not supported
+ */
  const char *
  get_encoding_name_for_icu(int encoding)
  {
-       const char *icu_encoding_name;
-
         StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
                                          "pg_enc2icu_tbl incomplete");
  
-       icu_encoding_name = pg_enc2icu_tbl[encoding];
-
-       if (!icu_encoding_name)
-               ereport(ERROR,
-                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("encoding \"%s\" not supported by ICU",
-                                               pg_encoding_to_char(encoding))));
-
-       return icu_encoding_name;
+       if (!PG_VALID_BE_ENCODING(encoding))
+               return NULL;
+       return pg_enc2icu_tbl[encoding];
  }
  
-#endif                                                 /* not FRONTEND */
-
  
  /* ----------
   * Encoding checks, for error returns -1 else encoding id
@@ -523,9 +516,10 @@ pg_valid_server_encoding_id(int encoding)
         return PG_VALID_BE_ENCODING(encoding);
  }
  
-/* ----------
- * Remove irrelevant chars from encoding name
- * ----------
+/*
+ * Remove irrelevant chars from encoding name, store at *newkey
+ *
+ * (Caller's responsibility to provide a large enough buffer)
   */
  static char *
  clean_encoding_name(const char *key, char *newkey)
@@ -547,11 +541,10 @@ clean_encoding_name(const char *key, char *newkey)
         return newkey;
  }
  
-/* ----------
+/*
   * Search encoding by encoding name
   *
- * Returns encoding ID, or -1 for error
- * ----------
+ * Returns encoding ID, or -1 if not recognized
   */
  int
  pg_char_to_encoding(const char *name)
@@ -568,16 +561,8 @@ pg_char_to_encoding(const char *name)
                 return -1;
  
         if (strlen(name) >= NAMEDATALEN)
-       {
-#ifdef FRONTEND
-               fprintf(stderr, "encoding name too long\n");
-               return -1;
-#else
-               ereport(ERROR,
-                               (errcode(ERRCODE_NAME_TOO_LONG),
-                                errmsg("encoding name too long")));
-#endif
-       }
+               return -1;                              /* it's certainly not in the table */
+
         key = clean_encoding_name(name, buff);
  
         while (last >= base)
@@ -599,16 +584,6 @@ pg_char_to_encoding(const char *name)
         return -1;
  }
  
-#ifndef FRONTEND
-Datum
-PG_char_to_encoding(PG_FUNCTION_ARGS)
-{
-       Name            s = PG_GETARG_NAME(0);
-
-       PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
-}
-#endif
-
  const char *
  pg_encoding_to_char(int encoding)
  {
@@ -621,15 +596,3 @@ pg_encoding_to_char(int encoding)
         }
         return "";
  }
-
-#ifndef FRONTEND
-Datum
-PG_encoding_to_char(PG_FUNCTION_ARGS)
-{
-       int32           encoding = PG_GETARG_INT32(0);
-       const char *encoding_name = pg_encoding_to_char(encoding);
-
-       return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
-}
-
-#endif
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 74a88239a1dc08e06482faecb89c04336622aff5..efaf1c155bbdec453a603596e18b0b84729b97a1 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -10,11 +10,7 @@
   *
   *-------------------------------------------------------------------------
   */
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#endif
+#include "c.h"
  
  #include "mb/pg_wchar.h"
  
@@ -838,6 +834,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
         return cnt;
  }
  
+/* exported for direct use by conv.c */
  int
  pg_mule_mblen(const unsigned char *s)
  {
@@ -1498,214 +1495,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
         return true;
  }
  
-#ifndef FRONTEND
-
-/*
- * Generic character incrementer function.
- *
- * Not knowing anything about the properties of the encoding in use, we just
- * keep incrementing the last byte until we get a validly-encoded result,
- * or we run out of values to try.  We don't bother to try incrementing
- * higher-order bytes, so there's no growth in runtime for wider characters.
- * (If we did try to do that, we'd need to consider the likelihood that 255
- * is not a valid final byte in the encoding.)
- */
-static bool
-pg_generic_charinc(unsigned char *charptr, int len)
-{
-       unsigned char *lastbyte = charptr + len - 1;
-       mbverifier      mbverify;
-
-       /* We can just invoke the character verifier directly. */
-       mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
-
-       while (*lastbyte < (unsigned char) 255)
-       {
-               (*lastbyte)++;
-               if ((*mbverify) (charptr, len) == len)
-                       return true;
-       }
-
-       return false;
-}
-
-/*
- * UTF-8 character incrementer function.
- *
- * For a one-byte character less than 0x7F, we just increment the byte.
- *
- * For a multibyte character, every byte but the first must fall between 0x80
- * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
- * the last byte that's not already at its maximum value.  If we can't find a
- * byte that's less than the maximum allowable value, we simply fail.  We also
- * need some special-case logic to skip regions used for surrogate pair
- * handling, as those should not occur in valid UTF-8.
- *
- * Note that we don't reset lower-order bytes back to their minimums, since
- * we can't afford to make an exhaustive search (see make_greater_string).
- */
-static bool
-pg_utf8_increment(unsigned char *charptr, int length)
-{
-       unsigned char a;
-       unsigned char limit;
-
-       switch (length)
-       {
-               default:
-                       /* reject lengths 5 and 6 for now */
-                       return false;
-               case 4:
-                       a = charptr[3];
-                       if (a < 0xBF)
-                       {
-                               charptr[3]++;
-                               break;
-                       }
-                       /* FALL THRU */
-               case 3:
-                       a = charptr[2];
-                       if (a < 0xBF)
-                       {
-                               charptr[2]++;
-                               break;
-                       }
-                       /* FALL THRU */
-               case 2:
-                       a = charptr[1];
-                       switch (*charptr)
-                       {
-                               case 0xED:
-                                       limit = 0x9F;
-                                       break;
-                               case 0xF4:
-                                       limit = 0x8F;
-                                       break;
-                               default:
-                                       limit = 0xBF;
-                                       break;
-                       }
-                       if (a < limit)
-                       {
-                               charptr[1]++;
-                               break;
-                       }
-                       /* FALL THRU */
-               case 1:
-                       a = *charptr;
-                       if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
-                               return false;
-                       charptr[0]++;
-                       break;
-       }
-
-       return true;
-}
-
-/*
- * EUC-JP character incrementer function.
- *
- * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
- * representing JIS X 0201 characters with the second byte ranging between
- * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
- * and otherwise rewrite the whole sequence to 0xa1 0xa1.
- *
- * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
- * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
- * is incremented if possible, otherwise the second-to-last byte.
- *
- * If the sequence starts with a value other than the above and its MSB
- * is set, it must be a two-byte sequence representing JIS X 0208 characters
- * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
- * incremented if possible, otherwise the second-to-last byte.
- *
- * Otherwise, the sequence is a single-byte ASCII character. It is
- * incremented up to 0x7f.
- */
-static bool
-pg_eucjp_increment(unsigned char *charptr, int length)
-{
-       unsigned char c1,
-                               c2;
-       int                     i;
-
-       c1 = *charptr;
-
-       switch (c1)
-       {
-               case SS2:                               /* JIS X 0201 */
-                       if (length != 2)
-                               return false;
-
-                       c2 = charptr[1];
-
-                       if (c2 >= 0xdf)
-                               charptr[0] = charptr[1] = 0xa1;
-                       else if (c2 < 0xa1)
-                               charptr[1] = 0xa1;
-                       else
-                               charptr[1]++;
-                       break;
-
-               case SS3:                               /* JIS X 0212 */
-                       if (length != 3)
-                               return false;
-
-                       for (i = 2; i > 0; i--)
-                       {
-                               c2 = charptr[i];
-                               if (c2 < 0xa1)
-                               {
-                                       charptr[i] = 0xa1;
-                                       return true;
-                               }
-                               else if (c2 < 0xfe)
-                               {
-                                       charptr[i]++;
-                                       return true;
-                               }
-                       }
-
-                       /* Out of 3-byte code region */
-                       return false;
-
-               default:
-                       if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
-                       {
-                               if (length != 2)
-                                       return false;
-
-                               for (i = 1; i >= 0; i--)
-                               {
-                                       c2 = charptr[i];
-                                       if (c2 < 0xa1)
-                                       {
-                                               charptr[i] = 0xa1;
-                                               return true;
-                                       }
-                                       else if (c2 < 0xfe)
-                                       {
-                                               charptr[i]++;
-                                               return true;
-                                       }
-                               }
-
-                               /* Out of 2 byte code region */
-                               return false;
-                       }
-                       else
-                       {                                       /* ASCII, single byte */
-                               if (c1 > 0x7e)
-                                       return false;
-                               (*charptr)++;
-                       }
-                       break;
-       }
-
-       return true;
-}
-#endif                                                 /* !FRONTEND */
-
  
  /*
   *-------------------------------------------------------------------
@@ -1758,13 +1547,6 @@ const pg_wchar_tbl pg_wchar_table[] = {
         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}      /* PG_SHIFT_JIS_2004 */
  };
  
-/* returns the byte length of a word for mule internal code */
-int
-pg_mic_mblen(const unsigned char *mbstr)
-{
-       return pg_mule_mblen(mbstr);
-}
-
  /*
   * Returns the byte length of a multibyte character.
   */
@@ -1810,232 +1592,3 @@ pg_encoding_max_length(int encoding)
  
         return pg_wchar_table[encoding].maxmblen;
  }
-
-#ifndef FRONTEND
-
-/*
- * fetch maximum length of the encoding for the current database
- */
-int
-pg_database_encoding_max_length(void)
-{
-       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
-}
-
-/*
- * get the character incrementer for the encoding for the current database
- */
-mbcharacter_incrementer
-pg_database_encoding_character_incrementer(void)
-{
-       /*
-        * Eventually it might be best to add a field to pg_wchar_table[], but for
-        * now we just use a switch.
-        */
-       switch (GetDatabaseEncoding())
-       {
-               case PG_UTF8:
-                       return pg_utf8_increment;
-
-               case PG_EUC_JP:
-                       return pg_eucjp_increment;
-
-               default:
-                       return pg_generic_charinc;
-       }
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the current
- * database encoding.  Otherwise same as pg_verify_mbstr().
- */
-bool
-pg_verifymbstr(const char *mbstr, int len, bool noError)
-{
-       return
-               pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- */
-bool
-pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
-{
-       return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- *
- * mbstr is not necessarily zero terminated; length of mbstr is
- * specified by len.
- *
- * If OK, return length of string in the encoding.
- * If a problem is found, return -1 when noError is
- * true; when noError is false, ereport() a descriptive message.
- */
-int
-pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
-{
-       mbverifier      mbverify;
-       int                     mb_len;
-
-       Assert(PG_VALID_ENCODING(encoding));
-
-       /*
-        * In single-byte encodings, we need only reject nulls (\0).
-        */
-       if (pg_encoding_max_length(encoding) <= 1)
-       {
-               const char *nullpos = memchr(mbstr, 0, len);
-
-               if (nullpos == NULL)
-                       return len;
-               if (noError)
-                       return -1;
-               report_invalid_encoding(encoding, nullpos, 1);
-       }
-
-       /* fetch function pointer just once */
-       mbverify = pg_wchar_table[encoding].mbverify;
-
-       mb_len = 0;
-
-       while (len > 0)
-       {
-               int                     l;
-
-               /* fast path for ASCII-subset characters */
-               if (!IS_HIGHBIT_SET(*mbstr))
-               {
-                       if (*mbstr != '\0')
-                       {
-                               mb_len++;
-                               mbstr++;
-                               len--;
-                               continue;
-                       }
-                       if (noError)
-                               return -1;
-                       report_invalid_encoding(encoding, mbstr, len);
-               }
-
-               l = (*mbverify) ((const unsigned char *) mbstr, len);
-
-               if (l < 0)
-               {
-                       if (noError)
-                               return -1;
-                       report_invalid_encoding(encoding, mbstr, len);
-               }
-
-               mbstr += l;
-               len -= l;
-               mb_len++;
-       }
-       return mb_len;
-}
-
-/*
- * check_encoding_conversion_args: check arguments of a conversion function
- *
- * "expected" arguments can be either an encoding ID or -1 to indicate that
- * the caller will check whether it accepts the ID.
- *
- * Note: the errors here are not really user-facing, so elog instead of
- * ereport seems sufficient.  Also, we trust that the "expected" encoding
- * arguments are valid encoding IDs, but we don't trust the actuals.
- */
-void
-check_encoding_conversion_args(int src_encoding,
-                                                          int dest_encoding,
-                                                          int len,
-                                                          int expected_src_encoding,
-                                                          int expected_dest_encoding)
-{
-       if (!PG_VALID_ENCODING(src_encoding))
-               elog(ERROR, "invalid source encoding ID: %d", src_encoding);
-       if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
-               elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
-                        pg_enc2name_tbl[expected_src_encoding].name,
-                        pg_enc2name_tbl[src_encoding].name);
-       if (!PG_VALID_ENCODING(dest_encoding))
-               elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
-       if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
-               elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
-                        pg_enc2name_tbl[expected_dest_encoding].name,
-                        pg_enc2name_tbl[dest_encoding].name);
-       if (len < 0)
-               elog(ERROR, "encoding conversion length must not be negative");
-}
-
-/*
- * report_invalid_encoding: complain about invalid multibyte character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_invalid_encoding(int encoding, const char *mbstr, int len)
-{
-       int                     l = pg_encoding_mblen(encoding, mbstr);
-       char            buf[8 * 5 + 1];
-       char       *p = buf;
-       int                     j,
-                               jlimit;
-
-       jlimit = Min(l, len);
-       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
-
-       for (j = 0; j < jlimit; j++)
-       {
-               p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
-               if (j < jlimit - 1)
-                       p += sprintf(p, " ");
-       }
-
-       ereport(ERROR,
-                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                        errmsg("invalid byte sequence for encoding \"%s\": %s",
-                                       pg_enc2name_tbl[encoding].name,
-                                       buf)));
-}
-
-/*
- * report_untranslatable_char: complain about untranslatable character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_untranslatable_char(int src_encoding, int dest_encoding,
-                                                  const char *mbstr, int len)
-{
-       int                     l = pg_encoding_mblen(src_encoding, mbstr);
-       char            buf[8 * 5 + 1];
-       char       *p = buf;
-       int                     j,
-                               jlimit;
-
-       jlimit = Min(l, len);
-       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
-
-       for (j = 0; j < jlimit; j++)
-       {
-               p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
-               if (j < jlimit - 1)
-                       p += sprintf(p, " ");
-       }
-
-       ereport(ERROR,
-                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                        errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
-                                       buf,
-                                       pg_enc2name_tbl[src_encoding].name,
-                                       pg_enc2name_tbl[dest_encoding].name)));
-}
-
-#endif                                                 /* !FRONTEND */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index 026f64f90be40456365e9fc20fe50f26823e2460..b8892ef730e3f3384885c7bd8c795a372d103302 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -9,7 +9,7 @@
   * src/include/mb/pg_wchar.h
   *
   *     NOTES
- *             This is used both by the backend and by libpq, but should not be
+ *             This is used both by the backend and by frontends, but should not be
   *             included by libpq client programs.  In particular, a libpq client
   *             should not assume that the encoding IDs used by the version of libpq
   *             it's linked to match up with the IDs declared here.
@@ -345,12 +345,6 @@ typedef struct pg_enc2gettext
  
  extern const pg_enc2gettext pg_enc2gettext_tbl[];
  
-/*
- * Encoding names for ICU
- */
-extern bool is_encoding_supported_by_icu(int encoding);
-extern const char *get_encoding_name_for_icu(int encoding);
-
  /*
   * pg_wchar stuff
   */
@@ -539,8 +533,27 @@ extern const char *pg_encoding_to_char(int encoding);
  extern int     pg_valid_server_encoding_id(int encoding);
  
  /*
- * Remaining functions are not considered part of libpq's API, though many
- * of them do exist inside libpq.
+ * These functions are available to frontend code that links with libpgcommon
+ * (in addition to the ones just above).  The constant tables declared
+ * earlier in this file are also available from libpgcommon.
+ */
+extern int     pg_encoding_mblen(int encoding, const char *mbstr);
+extern int     pg_encoding_dsplen(int encoding, const char *mbstr);
+extern int     pg_encoding_verifymb(int encoding, const char *mbstr, int len);
+extern int     pg_encoding_max_length(int encoding);
+extern int     pg_valid_client_encoding(const char *name);
+extern int     pg_valid_server_encoding(const char *name);
+extern bool is_encoding_supported_by_icu(int encoding);
+extern const char *get_encoding_name_for_icu(int encoding);
+
+extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
+extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern bool pg_utf8_islegal(const unsigned char *source, int length);
+extern int     pg_utf_mblen(const unsigned char *s);
+extern int     pg_mule_mblen(const unsigned char *s);
+
+/*
+ * The remaining functions are backend-only.
   */
  extern int     pg_mb2wchar(const char *from, pg_wchar *to);
  extern int     pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
@@ -556,18 +569,12 @@ extern int        pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t
  extern size_t pg_wchar_strlen(const pg_wchar *wstr);
  extern int     pg_mblen(const char *mbstr);
  extern int     pg_dsplen(const char *mbstr);
-extern int     pg_encoding_mblen(int encoding, const char *mbstr);
-extern int     pg_encoding_dsplen(int encoding, const char *mbstr);
-extern int     pg_encoding_verifymb(int encoding, const char *mbstr, int len);
-extern int     pg_mule_mblen(const unsigned char *mbstr);
-extern int     pg_mic_mblen(const unsigned char *mbstr);
  extern int     pg_mbstrlen(const char *mbstr);
  extern int     pg_mbstrlen_with_len(const char *mbstr, int len);
  extern int     pg_mbcliplen(const char *mbstr, int len, int limit);
  extern int     pg_encoding_mbcliplen(int encoding, const char *mbstr,
                                                                   int len, int limit);
  extern int     pg_mbcharcliplen(const char *mbstr, int len, int limit);
-extern int     pg_encoding_max_length(int encoding);
  extern int     pg_database_encoding_max_length(void);
  extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
  
@@ -587,12 +594,6 @@ extern int GetMessageEncoding(void);
  extern int     pg_bind_textdomain_codeset(const char *domainname);
  #endif
  
-extern int     pg_valid_client_encoding(const char *name);
-extern int     pg_valid_server_encoding(const char *name);
-
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
-extern int     pg_utf_mblen(const unsigned char *);
  extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
                                                                                                 int src_encoding,
                                                                                                 int dest_encoding);
@@ -647,8 +648,6 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
                                                                  int len, int lc, int encoding,
                                                                  const unsigned char *tab);
  
-extern bool pg_utf8_islegal(const unsigned char *source, int length);
-
  #ifdef WIN32
  extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  #endif
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
src/backend/utils/adt/pg_locale.c		patch \| blob \| blame \| history
src/backend/utils/mb/conv.c		patch \| blob \| blame \| history
src/backend/utils/mb/mbutils.c		patch \| blob \| blame \| history
src/common/encnames.c		patch \| blob \| blame \| history
src/common/wchar.c		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history