Rationalize code placement between wchar.c, encnames.c, and mbutils.c.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c

index 0eb915e62e9fb38bcea00c416f15ace2fabf252f..25fb7e2ebfa120c735b28bf14d01eeb310e96c30 100644 (file)
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1555,9 +1555,14 @@ init_icu_converter(void)
     UConverter *conv;
  
     if (icu_converter)
-       return;
+       return;                 /* already done */
  
     icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
+   if (!icu_encoding_name)
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("encoding \"%s\" not supported by ICU",
+                       pg_encoding_to_char(GetDatabaseEncoding()))));
  
     status = U_ZERO_ERROR;
     conv = ucnv_open(icu_encoding_name, &status);
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c

index 11f17b337e6511bbff8d9e363f5f34d3a8570afd..54dcf71fb75625163fccb7d29c793999ebfc9b74 100644 (file)
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -115,7 +115,7 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
         }
         else
         {
-           int         l = pg_mic_mblen(mic);
+           int         l = pg_mule_mblen(mic);
  
             if (len < l)
                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
@@ -217,7 +217,7 @@ mic2latin_with_table(const unsigned char *mic,
         }
         else
         {
-           int         l = pg_mic_mblen(mic);
+           int         l = pg_mule_mblen(mic);
  
             if (len < l)
                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c

index 5d7cc74ad6c869e8011354d50fa7ba92a462ac62..86787bcb3190a0048bf7cff7eccaf6c3f1053732 100644 (file)
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -1066,6 +1066,23 @@ pg_client_encoding(PG_FUNCTION_ARGS)
     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
  }
  
+Datum
+PG_char_to_encoding(PG_FUNCTION_ARGS)
+{
+   Name        s = PG_GETARG_NAME(0);
+
+   PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
+}
+
+Datum
+PG_encoding_to_char(PG_FUNCTION_ARGS)
+{
+   int32       encoding = PG_GETARG_INT32(0);
+   const char *encoding_name = pg_encoding_to_char(encoding);
+
+   return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
+}
+
  /*
   * gettext() returns messages in this encoding.  This often matches the
   * database encoding, but it differs for SQL_ASCII databases, for processes
@@ -1078,6 +1095,438 @@ GetMessageEncoding(void)
     return MessageEncoding->encoding;
  }
  
+
+/*
+ * Generic character incrementer function.
+ *
+ * Not knowing anything about the properties of the encoding in use, we just
+ * keep incrementing the last byte until we get a validly-encoded result,
+ * or we run out of values to try.  We don't bother to try incrementing
+ * higher-order bytes, so there's no growth in runtime for wider characters.
+ * (If we did try to do that, we'd need to consider the likelihood that 255
+ * is not a valid final byte in the encoding.)
+ */
+static bool
+pg_generic_charinc(unsigned char *charptr, int len)
+{
+   unsigned char *lastbyte = charptr + len - 1;
+   mbverifier  mbverify;
+
+   /* We can just invoke the character verifier directly. */
+   mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+
+   while (*lastbyte < (unsigned char) 255)
+   {
+       (*lastbyte)++;
+       if ((*mbverify) (charptr, len) == len)
+           return true;
+   }
+
+   return false;
+}
+
+/*
+ * UTF-8 character incrementer function.
+ *
+ * For a one-byte character less than 0x7F, we just increment the byte.
+ *
+ * For a multibyte character, every byte but the first must fall between 0x80
+ * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
+ * the last byte that's not already at its maximum value.  If we can't find a
+ * byte that's less than the maximum allowable value, we simply fail.  We also
+ * need some special-case logic to skip regions used for surrogate pair
+ * handling, as those should not occur in valid UTF-8.
+ *
+ * Note that we don't reset lower-order bytes back to their minimums, since
+ * we can't afford to make an exhaustive search (see make_greater_string).
+ */
+static bool
+pg_utf8_increment(unsigned char *charptr, int length)
+{
+   unsigned char a;
+   unsigned char limit;
+
+   switch (length)
+   {
+       default:
+           /* reject lengths 5 and 6 for now (and any length outside 1..4) */
+           return false;
+       case 4:
+           a = charptr[3];
+           if (a < 0xBF)
+           {
+               charptr[3]++;
+               break;
+           }
+           /* FALL THRU */
+       case 3:
+           a = charptr[2];
+           if (a < 0xBF)
+           {
+               charptr[2]++;
+               break;
+           }
+           /* FALL THRU */
+       case 2:
+           a = charptr[1];
+           switch (*charptr)
+           {
+               case 0xED:
+                   limit = 0x9F;
+                   break;
+               case 0xF4:
+                   limit = 0x8F;
+                   break;
+               default:
+                   limit = 0xBF;
+                   break;
+           }
+           if (a < limit)
+           {
+               charptr[1]++;
+               break;
+           }
+           /* FALL THRU */
+       case 1:
+           a = *charptr;
+           if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
+               return false;
+           charptr[0]++;
+           break;
+   }
+
+   return true;
+}
+
+/*
+ * EUC-JP character incrementer function.
+ *
+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
+ * representing JIS X 0201 characters with the second byte ranging between
+ * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.
+ *
+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
+ * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
+ * is incremented if possible, otherwise the second-to-last byte.
+ *
+ * If the sequence starts with a value other than the above and its MSB
+ * is set, it must be a two-byte sequence representing JIS X 0208 characters
+ * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
+ * incremented if possible, otherwise the second-to-last byte.
+ *
+ * Otherwise, the sequence is a single-byte ASCII character. It is
+ * incremented up to 0x7f.
+ */
+static bool
+pg_eucjp_increment(unsigned char *charptr, int length)
+{
+   unsigned char c1,
+               c2;
+   int         i;
+
+   c1 = *charptr;
+
+   switch (c1)
+   {
+       case SS2:               /* JIS X 0201 */
+           if (length != 2)
+               return false;
+
+           c2 = charptr[1];
+
+           if (c2 >= 0xdf)
+               charptr[0] = charptr[1] = 0xa1;
+           else if (c2 < 0xa1)
+               charptr[1] = 0xa1;
+           else
+               charptr[1]++;
+           break;
+
+       case SS3:               /* JIS X 0212 */
+           if (length != 3)
+               return false;
+
+           for (i = 2; i > 0; i--)
+           {
+               c2 = charptr[i];
+               if (c2 < 0xa1)
+               {
+                   charptr[i] = 0xa1;
+                   return true;
+               }
+               else if (c2 < 0xfe)
+               {
+                   charptr[i]++;
+                   return true;
+               }
+           }
+
+           /* Out of 3-byte code region */
+           return false;
+
+       default:
+           if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
+           {
+               if (length != 2)
+                   return false;
+
+               for (i = 1; i >= 0; i--)
+               {
+                   c2 = charptr[i];
+                   if (c2 < 0xa1)
+                   {
+                       charptr[i] = 0xa1;
+                       return true;
+                   }
+                   else if (c2 < 0xfe)
+                   {
+                       charptr[i]++;
+                       return true;
+                   }
+               }
+
+               /* Out of 2 byte code region */
+               return false;
+           }
+           else
+           {                   /* ASCII, single byte */
+               if (c1 > 0x7e)
+                   return false;
+               (*charptr)++;
+           }
+           break;
+   }
+
+   return true;
+}
+
+/*
+ * get the character incrementer for the encoding for the current database
+ */
+mbcharacter_incrementer
+pg_database_encoding_character_incrementer(void)
+{
+   /*
+    * Eventually it might be best to add a field to pg_wchar_table[], but for
+    * now we just use a switch.
+    */
+   switch (GetDatabaseEncoding())
+   {
+       case PG_UTF8:
+           return pg_utf8_increment;
+
+       case PG_EUC_JP:
+           return pg_eucjp_increment;
+
+       default:
+           return pg_generic_charinc;
+   }
+}
+
+/*
+ * fetch maximum length of the encoding for the current database
+ */
+int
+pg_database_encoding_max_length(void)
+{
+   return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding.  Otherwise same as pg_verify_mbstr().
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+   return
+       pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+   return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
+ * specified by len.
+ *
+ * If OK, return the length of the string in characters (not bytes).
+ * If a problem is found, return -1 when noError is
+ * true; when noError is false, ereport() a descriptive message.
+ */
+int
+pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
+{
+   mbverifier  mbverify;
+   int         mb_len;
+
+   Assert(PG_VALID_ENCODING(encoding));
+
+   /*
+    * In single-byte encodings, we need only reject nulls (\0).
+    */
+   if (pg_encoding_max_length(encoding) <= 1)
+   {
+       const char *nullpos = memchr(mbstr, 0, len);
+
+       if (nullpos == NULL)
+           return len;
+       if (noError)
+           return -1;
+       report_invalid_encoding(encoding, nullpos, 1);
+   }
+
+   /* fetch function pointer just once */
+   mbverify = pg_wchar_table[encoding].mbverify;
+
+   mb_len = 0;
+
+   while (len > 0)
+   {
+       int         l;
+
+       /* fast path for ASCII-subset characters */
+       if (!IS_HIGHBIT_SET(*mbstr))
+       {
+           if (*mbstr != '\0')
+           {
+               mb_len++;
+               mbstr++;
+               len--;
+               continue;
+           }
+           if (noError)
+               return -1;
+           report_invalid_encoding(encoding, mbstr, len);
+       }
+
+       l = (*mbverify) ((const unsigned char *) mbstr, len);
+
+       if (l < 0)
+       {
+           if (noError)
+               return -1;
+           report_invalid_encoding(encoding, mbstr, len);
+       }
+
+       mbstr += l;
+       len -= l;
+       mb_len++;
+   }
+   return mb_len;
+}
+
+/*
+ * check_encoding_conversion_args: check arguments of a conversion function
+ *
+ * "expected" arguments can be either an encoding ID or -1 to indicate that
+ * the caller will check whether it accepts the ID.
+ *
+ * Note: the errors here are not really user-facing, so elog instead of
+ * ereport seems sufficient.  Also, we trust that the "expected" encoding
+ * arguments are valid encoding IDs, but we don't trust the actuals.
+ */
+void
+check_encoding_conversion_args(int src_encoding,
+                              int dest_encoding,
+                              int len,
+                              int expected_src_encoding,
+                              int expected_dest_encoding)
+{
+   if (!PG_VALID_ENCODING(src_encoding))
+       elog(ERROR, "invalid source encoding ID: %d", src_encoding);
+   if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
+       elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
+            pg_enc2name_tbl[expected_src_encoding].name,
+            pg_enc2name_tbl[src_encoding].name);
+   if (!PG_VALID_ENCODING(dest_encoding))
+       elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
+   if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
+       elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
+            pg_enc2name_tbl[expected_dest_encoding].name,
+            pg_enc2name_tbl[dest_encoding].name);
+   if (len < 0)
+       elog(ERROR, "encoding conversion length must not be negative");
+}
+
+/*
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
+{
+   int         l = pg_encoding_mblen(encoding, mbstr);
+   char        buf[8 * 5 + 1];
+   char       *p = buf;
+   int         j,
+               jlimit;
+
+   jlimit = Min(l, len);
+   jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
+
+   for (j = 0; j < jlimit; j++)
+   {
+       p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+       if (j < jlimit - 1)
+           p += sprintf(p, " ");
+   }
+
+   ereport(ERROR,
+           (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+            errmsg("invalid byte sequence for encoding \"%s\": %s",
+                   pg_enc2name_tbl[encoding].name,
+                   buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+                          const char *mbstr, int len)
+{
+   int         l = pg_encoding_mblen(src_encoding, mbstr);
+   char        buf[8 * 5 + 1];
+   char       *p = buf;
+   int         j,
+               jlimit;
+
+   jlimit = Min(l, len);
+   jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
+
+   for (j = 0; j < jlimit; j++)
+   {
+       p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+       if (j < jlimit - 1)
+           p += sprintf(p, " ");
+   }
+
+   ereport(ERROR,
+           (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+            errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
+                   buf,
+                   pg_enc2name_tbl[src_encoding].name,
+                   pg_enc2name_tbl[dest_encoding].name)));
+}
+
+
  #ifdef WIN32
  /*
   * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
@@ -1149,4 +1598,4 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
     return utf16;
  }
  
-#endif
+#endif                         /* WIN32 */
diff --git a/src/common/encnames.c b/src/common/encnames.c

index 2086e007fc547b9be20ace86c76f0dc432a2ab0b..14cf1b39e9866fb2190b28b9b9a7fc071634ea25 100644 (file)
--- a/src/common/encnames.c
+++ b/src/common/encnames.c
@@ -10,12 +10,7 @@
   *
   *-------------------------------------------------------------------------
   */
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#include "utils/builtins.h"
-#endif
+#include "c.h"
  
  #include <ctype.h>
  #include <unistd.h>
@@ -310,6 +305,7 @@ static const pg_encname pg_encname_tbl[] =
  #else
  #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
  #endif
+
  const pg_enc2name pg_enc2name_tbl[] =
  {
     DEF_ENC2NAME(SQL_ASCII, 0),
@@ -409,10 +405,8 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
  };
  
  
-#ifndef FRONTEND
-
  /*
- * Table of encoding names for ICU
+ * Table of encoding names for ICU (currently covers backend encodings only)
   *
   * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
   *
@@ -457,33 +451,32 @@ static const char *const pg_enc2icu_tbl[] =
     "KOI8-U",                   /* PG_KOI8U */
  };
  
+
+/*
+ * Is this encoding supported by ICU?
+ */
  bool
  is_encoding_supported_by_icu(int encoding)
  {
+   if (!PG_VALID_BE_ENCODING(encoding))
+       return false;
     return (pg_enc2icu_tbl[encoding] != NULL);
  }
  
+/*
+ * Returns ICU's name for encoding, or NULL if not supported
+ */
  const char *
  get_encoding_name_for_icu(int encoding)
  {
-   const char *icu_encoding_name;
-
     StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
                      "pg_enc2icu_tbl incomplete");
  
-   icu_encoding_name = pg_enc2icu_tbl[encoding];
-
-   if (!icu_encoding_name)
-       ereport(ERROR,
-               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                errmsg("encoding \"%s\" not supported by ICU",
-                       pg_encoding_to_char(encoding))));
-
-   return icu_encoding_name;
+   if (!PG_VALID_BE_ENCODING(encoding))
+       return NULL;
+   return pg_enc2icu_tbl[encoding];
  }
  
-#endif                         /* not FRONTEND */
-
  
  /* ----------
   * Encoding checks, for error returns -1 else encoding id
@@ -523,9 +516,10 @@ pg_valid_server_encoding_id(int encoding)
     return PG_VALID_BE_ENCODING(encoding);
  }
  
-/* ----------
- * Remove irrelevant chars from encoding name
- * ----------
+/*
+ * Remove irrelevant chars from encoding name, store at *newkey
+ *
+ * (Caller's responsibility to provide a large enough buffer)
   */
  static char *
  clean_encoding_name(const char *key, char *newkey)
@@ -547,11 +541,10 @@ clean_encoding_name(const char *key, char *newkey)
     return newkey;
  }
  
-/* ----------
+/*
   * Search encoding by encoding name
   *
- * Returns encoding ID, or -1 for error
- * ----------
+ * Returns encoding ID, or -1 if not recognized
   */
  int
  pg_char_to_encoding(const char *name)
@@ -568,16 +561,8 @@ pg_char_to_encoding(const char *name)
         return -1;
  
     if (strlen(name) >= NAMEDATALEN)
-   {
-#ifdef FRONTEND
-       fprintf(stderr, "encoding name too long\n");
-       return -1;
-#else
-       ereport(ERROR,
-               (errcode(ERRCODE_NAME_TOO_LONG),
-                errmsg("encoding name too long")));
-#endif
-   }
+       return -1;              /* it's certainly not in the table */
+
     key = clean_encoding_name(name, buff);
  
     while (last >= base)
@@ -599,16 +584,6 @@ pg_char_to_encoding(const char *name)
     return -1;
  }
  
-#ifndef FRONTEND
-Datum
-PG_char_to_encoding(PG_FUNCTION_ARGS)
-{
-   Name        s = PG_GETARG_NAME(0);
-
-   PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
-}
-#endif
-
  const char *
  pg_encoding_to_char(int encoding)
  {
@@ -621,15 +596,3 @@ pg_encoding_to_char(int encoding)
     }
     return "";
  }
-
-#ifndef FRONTEND
-Datum
-PG_encoding_to_char(PG_FUNCTION_ARGS)
-{
-   int32       encoding = PG_GETARG_INT32(0);
-   const char *encoding_name = pg_encoding_to_char(encoding);
-
-   return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
-}
-
-#endif
diff --git a/src/common/wchar.c b/src/common/wchar.c

index 74a88239a1dc08e06482faecb89c04336622aff5..efaf1c155bbdec453a603596e18b0b84729b97a1 100644 (file)
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -10,11 +10,7 @@
   *
   *-------------------------------------------------------------------------
   */
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#endif
+#include "c.h"
  
  #include "mb/pg_wchar.h"
  
@@ -838,6 +834,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     return cnt;
  }
  
+/* exported for direct use by conv.c */
  int
  pg_mule_mblen(const unsigned char *s)
  {
@@ -1498,214 +1495,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
     return true;
  }
  
-#ifndef FRONTEND
-
-/*
- * Generic character incrementer function.
- *
- * Not knowing anything about the properties of the encoding in use, we just
- * keep incrementing the last byte until we get a validly-encoded result,
- * or we run out of values to try.  We don't bother to try incrementing
- * higher-order bytes, so there's no growth in runtime for wider characters.
- * (If we did try to do that, we'd need to consider the likelihood that 255
- * is not a valid final byte in the encoding.)
- */
-static bool
-pg_generic_charinc(unsigned char *charptr, int len)
-{
-   unsigned char *lastbyte = charptr + len - 1;
-   mbverifier  mbverify;
-
-   /* We can just invoke the character verifier directly. */
-   mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
-
-   while (*lastbyte < (unsigned char) 255)
-   {
-       (*lastbyte)++;
-       if ((*mbverify) (charptr, len) == len)
-           return true;
-   }
-
-   return false;
-}
-
-/*
- * UTF-8 character incrementer function.
- *
- * For a one-byte character less than 0x7F, we just increment the byte.
- *
- * For a multibyte character, every byte but the first must fall between 0x80
- * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
- * the last byte that's not already at its maximum value.  If we can't find a
- * byte that's less than the maximum allowable value, we simply fail.  We also
- * need some special-case logic to skip regions used for surrogate pair
- * handling, as those should not occur in valid UTF-8.
- *
- * Note that we don't reset lower-order bytes back to their minimums, since
- * we can't afford to make an exhaustive search (see make_greater_string).
- */
-static bool
-pg_utf8_increment(unsigned char *charptr, int length)
-{
-   unsigned char a;
-   unsigned char limit;
-
-   switch (length)
-   {
-       default:
-           /* reject lengths 5 and 6 for now */
-           return false;
-       case 4:
-           a = charptr[3];
-           if (a < 0xBF)
-           {
-               charptr[3]++;
-               break;
-           }
-           /* FALL THRU */
-       case 3:
-           a = charptr[2];
-           if (a < 0xBF)
-           {
-               charptr[2]++;
-               break;
-           }
-           /* FALL THRU */
-       case 2:
-           a = charptr[1];
-           switch (*charptr)
-           {
-               case 0xED:
-                   limit = 0x9F;
-                   break;
-               case 0xF4:
-                   limit = 0x8F;
-                   break;
-               default:
-                   limit = 0xBF;
-                   break;
-           }
-           if (a < limit)
-           {
-               charptr[1]++;
-               break;
-           }
-           /* FALL THRU */
-       case 1:
-           a = *charptr;
-           if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
-               return false;
-           charptr[0]++;
-           break;
-   }
-
-   return true;
-}
-
-/*
- * EUC-JP character incrementer function.
- *
- * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
- * representing JIS X 0201 characters with the second byte ranging between
- * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
- * and otherwise rewrite the whole sequence to 0xa1 0xa1.
- *
- * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
- * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
- * is incremented if possible, otherwise the second-to-last byte.
- *
- * If the sequence starts with a value other than the above and its MSB
- * is set, it must be a two-byte sequence representing JIS X 0208 characters
- * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
- * incremented if possible, otherwise the second-to-last byte.
- *
- * Otherwise, the sequence is a single-byte ASCII character. It is
- * incremented up to 0x7f.
- */
-static bool
-pg_eucjp_increment(unsigned char *charptr, int length)
-{
-   unsigned char c1,
-               c2;
-   int         i;
-
-   c1 = *charptr;
-
-   switch (c1)
-   {
-       case SS2:               /* JIS X 0201 */
-           if (length != 2)
-               return false;
-
-           c2 = charptr[1];
-
-           if (c2 >= 0xdf)
-               charptr[0] = charptr[1] = 0xa1;
-           else if (c2 < 0xa1)
-               charptr[1] = 0xa1;
-           else
-               charptr[1]++;
-           break;
-
-       case SS3:               /* JIS X 0212 */
-           if (length != 3)
-               return false;
-
-           for (i = 2; i > 0; i--)
-           {
-               c2 = charptr[i];
-               if (c2 < 0xa1)
-               {
-                   charptr[i] = 0xa1;
-                   return true;
-               }
-               else if (c2 < 0xfe)
-               {
-                   charptr[i]++;
-                   return true;
-               }
-           }
-
-           /* Out of 3-byte code region */
-           return false;
-
-       default:
-           if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
-           {
-               if (length != 2)
-                   return false;
-
-               for (i = 1; i >= 0; i--)
-               {
-                   c2 = charptr[i];
-                   if (c2 < 0xa1)
-                   {
-                       charptr[i] = 0xa1;
-                       return true;
-                   }
-                   else if (c2 < 0xfe)
-                   {
-                       charptr[i]++;
-                       return true;
-                   }
-               }
-
-               /* Out of 2 byte code region */
-               return false;
-           }
-           else
-           {                   /* ASCII, single byte */
-               if (c1 > 0x7e)
-                   return false;
-               (*charptr)++;
-           }
-           break;
-   }
-
-   return true;
-}
-#endif                         /* !FRONTEND */
-
  
  /*
   *-------------------------------------------------------------------
@@ -1758,13 +1547,6 @@ const pg_wchar_tbl pg_wchar_table[] = {
     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}  /* PG_SHIFT_JIS_2004 */
  };
  
-/* returns the byte length of a word for mule internal code */
-int
-pg_mic_mblen(const unsigned char *mbstr)
-{
-   return pg_mule_mblen(mbstr);
-}
-
  /*
   * Returns the byte length of a multibyte character.
   */
@@ -1810,232 +1592,3 @@ pg_encoding_max_length(int encoding)
  
     return pg_wchar_table[encoding].maxmblen;
  }
-
-#ifndef FRONTEND
-
-/*
- * fetch maximum length of the encoding for the current database
- */
-int
-pg_database_encoding_max_length(void)
-{
-   return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
-}
-
-/*
- * get the character incrementer for the encoding for the current database
- */
-mbcharacter_incrementer
-pg_database_encoding_character_incrementer(void)
-{
-   /*
-    * Eventually it might be best to add a field to pg_wchar_table[], but for
-    * now we just use a switch.
-    */
-   switch (GetDatabaseEncoding())
-   {
-       case PG_UTF8:
-           return pg_utf8_increment;
-
-       case PG_EUC_JP:
-           return pg_eucjp_increment;
-
-       default:
-           return pg_generic_charinc;
-   }
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the current
- * database encoding.  Otherwise same as pg_verify_mbstr().
- */
-bool
-pg_verifymbstr(const char *mbstr, int len, bool noError)
-{
-   return
-       pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- */
-bool
-pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
-{
-   return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- *
- * mbstr is not necessarily zero terminated; length of mbstr is
- * specified by len.
- *
- * If OK, return length of string in the encoding.
- * If a problem is found, return -1 when noError is
- * true; when noError is false, ereport() a descriptive message.
- */
-int
-pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
-{
-   mbverifier  mbverify;
-   int         mb_len;
-
-   Assert(PG_VALID_ENCODING(encoding));
-
-   /*
-    * In single-byte encodings, we need only reject nulls (\0).
-    */
-   if (pg_encoding_max_length(encoding) <= 1)
-   {
-       const char *nullpos = memchr(mbstr, 0, len);
-
-       if (nullpos == NULL)
-           return len;
-       if (noError)
-           return -1;
-       report_invalid_encoding(encoding, nullpos, 1);
-   }
-
-   /* fetch function pointer just once */
-   mbverify = pg_wchar_table[encoding].mbverify;
-
-   mb_len = 0;
-
-   while (len > 0)
-   {
-       int         l;
-
-       /* fast path for ASCII-subset characters */
-       if (!IS_HIGHBIT_SET(*mbstr))
-       {
-           if (*mbstr != '\0')
-           {
-               mb_len++;
-               mbstr++;
-               len--;
-               continue;
-           }
-           if (noError)
-               return -1;
-           report_invalid_encoding(encoding, mbstr, len);
-       }
-
-       l = (*mbverify) ((const unsigned char *) mbstr, len);
-
-       if (l < 0)
-       {
-           if (noError)
-               return -1;
-           report_invalid_encoding(encoding, mbstr, len);
-       }
-
-       mbstr += l;
-       len -= l;
-       mb_len++;
-   }
-   return mb_len;
-}
-
-/*
- * check_encoding_conversion_args: check arguments of a conversion function
- *
- * "expected" arguments can be either an encoding ID or -1 to indicate that
- * the caller will check whether it accepts the ID.
- *
- * Note: the errors here are not really user-facing, so elog instead of
- * ereport seems sufficient.  Also, we trust that the "expected" encoding
- * arguments are valid encoding IDs, but we don't trust the actuals.
- */
-void
-check_encoding_conversion_args(int src_encoding,
-                              int dest_encoding,
-                              int len,
-                              int expected_src_encoding,
-                              int expected_dest_encoding)
-{
-   if (!PG_VALID_ENCODING(src_encoding))
-       elog(ERROR, "invalid source encoding ID: %d", src_encoding);
-   if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
-       elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
-            pg_enc2name_tbl[expected_src_encoding].name,
-            pg_enc2name_tbl[src_encoding].name);
-   if (!PG_VALID_ENCODING(dest_encoding))
-       elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
-   if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
-       elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
-            pg_enc2name_tbl[expected_dest_encoding].name,
-            pg_enc2name_tbl[dest_encoding].name);
-   if (len < 0)
-       elog(ERROR, "encoding conversion length must not be negative");
-}
-
-/*
- * report_invalid_encoding: complain about invalid multibyte character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_invalid_encoding(int encoding, const char *mbstr, int len)
-{
-   int         l = pg_encoding_mblen(encoding, mbstr);
-   char        buf[8 * 5 + 1];
-   char       *p = buf;
-   int         j,
-               jlimit;
-
-   jlimit = Min(l, len);
-   jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
-
-   for (j = 0; j < jlimit; j++)
-   {
-       p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
-       if (j < jlimit - 1)
-           p += sprintf(p, " ");
-   }
-
-   ereport(ERROR,
-           (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-            errmsg("invalid byte sequence for encoding \"%s\": %s",
-                   pg_enc2name_tbl[encoding].name,
-                   buf)));
-}
-
-/*
- * report_untranslatable_char: complain about untranslatable character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_untranslatable_char(int src_encoding, int dest_encoding,
-                          const char *mbstr, int len)
-{
-   int         l = pg_encoding_mblen(src_encoding, mbstr);
-   char        buf[8 * 5 + 1];
-   char       *p = buf;
-   int         j,
-               jlimit;
-
-   jlimit = Min(l, len);
-   jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
-
-   for (j = 0; j < jlimit; j++)
-   {
-       p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
-       if (j < jlimit - 1)
-           p += sprintf(p, " ");
-   }
-
-   ereport(ERROR,
-           (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-            errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
-                   buf,
-                   pg_enc2name_tbl[src_encoding].name,
-                   pg_enc2name_tbl[dest_encoding].name)));
-}
-
-#endif                         /* !FRONTEND */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index 026f64f90be40456365e9fc20fe50f26823e2460..b8892ef730e3f3384885c7bd8c795a372d103302 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -9,7 +9,7 @@
   * src/include/mb/pg_wchar.h
   *
   * NOTES
- *     This is used both by the backend and by libpq, but should not be
+ *     This is used both by the backend and by frontends, but should not be
   *     included by libpq client programs.  In particular, a libpq client
   *     should not assume that the encoding IDs used by the version of libpq
   *     it's linked to match up with the IDs declared here.
@@ -345,12 +345,6 @@ typedef struct pg_enc2gettext
  
  extern const pg_enc2gettext pg_enc2gettext_tbl[];
  
-/*
- * Encoding names for ICU
- */
-extern bool is_encoding_supported_by_icu(int encoding);
-extern const char *get_encoding_name_for_icu(int encoding);
-
  /*
   * pg_wchar stuff
   */
@@ -539,8 +533,27 @@ extern const char *pg_encoding_to_char(int encoding);
  extern int pg_valid_server_encoding_id(int encoding);
  
  /*
- * Remaining functions are not considered part of libpq's API, though many
- * of them do exist inside libpq.
+ * These functions are available to frontend code that links with libpgcommon
+ * (in addition to the ones just above).  The constant tables declared
+ * earlier in this file are also available from libpgcommon.
+ */
+extern int pg_encoding_mblen(int encoding, const char *mbstr);
+extern int pg_encoding_dsplen(int encoding, const char *mbstr);
+extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
+extern int pg_encoding_max_length(int encoding);
+extern int pg_valid_client_encoding(const char *name);
+extern int pg_valid_server_encoding(const char *name);
+extern bool is_encoding_supported_by_icu(int encoding);
+extern const char *get_encoding_name_for_icu(int encoding);
+
+extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
+extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern bool pg_utf8_islegal(const unsigned char *source, int length);
+extern int pg_utf_mblen(const unsigned char *s);
+extern int pg_mule_mblen(const unsigned char *s);
+
+/*
+ * The remaining functions are backend-only.
   */
  extern int pg_mb2wchar(const char *from, pg_wchar *to);
  extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
@@ -556,18 +569,12 @@ extern int    pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t
  extern size_t pg_wchar_strlen(const pg_wchar *wstr);
  extern int pg_mblen(const char *mbstr);
  extern int pg_dsplen(const char *mbstr);
-extern int pg_encoding_mblen(int encoding, const char *mbstr);
-extern int pg_encoding_dsplen(int encoding, const char *mbstr);
-extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
-extern int pg_mule_mblen(const unsigned char *mbstr);
-extern int pg_mic_mblen(const unsigned char *mbstr);
  extern int pg_mbstrlen(const char *mbstr);
  extern int pg_mbstrlen_with_len(const char *mbstr, int len);
  extern int pg_mbcliplen(const char *mbstr, int len, int limit);
  extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
                                   int len, int limit);
  extern int pg_mbcharcliplen(const char *mbstr, int len, int limit);
-extern int pg_encoding_max_length(int encoding);
  extern int pg_database_encoding_max_length(void);
  extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
  
@@ -587,12 +594,6 @@ extern int GetMessageEncoding(void);
  extern int pg_bind_textdomain_codeset(const char *domainname);
  #endif
  
-extern int pg_valid_client_encoding(const char *name);
-extern int pg_valid_server_encoding(const char *name);
-
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
-extern int pg_utf_mblen(const unsigned char *);
  extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
                                                 int src_encoding,
                                                 int dest_encoding);
@@ -647,8 +648,6 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
                                  int len, int lc, int encoding,
                                  const unsigned char *tab);
  
-extern bool pg_utf8_islegal(const unsigned char *source, int length);
-
  #ifdef WIN32
  extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  #endif
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 16 Jan 2020 23:08:21 +0000 (18:08 -0500)
src/backend/utils/adt/pg_locale.c		patch | blob | blame | history
src/backend/utils/mb/conv.c		patch | blob | blame | history
src/backend/utils/mb/mbutils.c		patch | blob | blame | history
src/common/encnames.c		patch | blob | blame | history
src/common/wchar.c		patch | blob | blame | history
src/include/mb/pg_wchar.h		patch | blob | blame | history