UConverter *conv;
if (icu_converter)
- return;
+ return; /* already done */
icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
+ if (!icu_encoding_name)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("encoding \"%s\" not supported by ICU",
+ pg_encoding_to_char(GetDatabaseEncoding()))));
status = U_ZERO_ERROR;
conv = ucnv_open(icu_encoding_name, &status);
}
else
{
- int l = pg_mic_mblen(mic);
+ int l = pg_mule_mblen(mic);
if (len < l)
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
}
else
{
- int l = pg_mic_mblen(mic);
+ int l = pg_mule_mblen(mic);
if (len < l)
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
}
+Datum
+PG_char_to_encoding(PG_FUNCTION_ARGS)
+{
+ Name s = PG_GETARG_NAME(0); /* argument 0: the encoding name */
+ /* Wrapper: hand the name to pg_char_to_encoding() and return its result as an int32. */
+ PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
+}
+
+Datum
+PG_encoding_to_char(PG_FUNCTION_ARGS)
+{
+ int32 encoding = PG_GETARG_INT32(0); /* argument 0: the encoding ID */
+ const char *encoding_name = pg_encoding_to_char(encoding);
+ /* Wrapper: pass the C string from pg_encoding_to_char() through namein and return that Datum. */
+ return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
+}
+
/*
* gettext() returns messages in this encoding. This often matches the
* database encoding, but it differs for SQL_ASCII databases, for processes
return MessageEncoding->encoding;
}
+
+/*
+ * Generic character incrementer function.
+ *
+ * Not knowing anything about the properties of the encoding in use, we just
+ * keep incrementing the last byte until we get a validly-encoded result,
+ * or we run out of values to try. We don't bother to try incrementing
+ * higher-order bytes, so there's no growth in runtime for wider characters.
+ * (If we did try to do that, we'd need to consider the likelihood that 255
+ * is not a valid final byte in the encoding.)
+ *
+ * charptr points at the character, which is modified in place; len is its
+ * length in bytes. Returns true as soon as the database encoding's verifier
+ * returns exactly len for the modified character. Returns false once the
+ * last byte has reached 255 without that happening (or was already 255 on
+ * entry); in that case the last byte is left at 255.
+ */
+static bool
+pg_generic_charinc(unsigned char *charptr, int len)
+{
+ unsigned char *lastbyte = charptr + len - 1;
+ mbverifier mbverify;
+
+ /* We can just invoke the character verifier directly. */
+ mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+
+ while (*lastbyte < (unsigned char) 255)
+ {
+ (*lastbyte)++;
+ if ((*mbverify) (charptr, len) == len) /* verifier returned the unchanged length */
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * UTF-8 character incrementer function.
+ *
+ * For a one-byte character less than 0x7F, we just increment the byte.
+ *
+ * For a multibyte character, every byte but the first must fall between 0x80
+ * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
+ * the last byte that's not already at its maximum value. If we can't find a
+ * byte that's less than the maximum allowable value, we simply fail. We also
+ * need some special-case logic to skip regions used for surrogate pair
+ * handling, as those should not occur in valid UTF-8.
+ *
+ * Note that we don't reset lower-order bytes back to their minimums, since
+ * we can't afford to make an exhaustive search (see make_greater_string).
+ *
+ * charptr points at the character, which is modified in place; length is
+ * its length in bytes. Returns true if exactly one byte was incremented.
+ * Returns false, leaving the bytes unchanged, if length is not 1..4 or if
+ * every byte is already at its limit.
+ */
+static bool
+pg_utf8_increment(unsigned char *charptr, int length)
+{
+ unsigned char a;
+ unsigned char limit;
+
+ switch (length)
+ {
+ default:
+ /* reject lengths 5 and 6 for now */
+ return false;
+ case 4:
+ a = charptr[3];
+ if (a < 0xBF)
+ {
+ charptr[3]++;
+ break;
+ }
+ /* FALL THRU */
+ case 3:
+ a = charptr[2];
+ if (a < 0xBF)
+ {
+ charptr[2]++;
+ break;
+ }
+ /* FALL THRU */
+ case 2:
+ a = charptr[1];
+ switch (*charptr) /* the second byte's upper limit depends on the lead byte */
+ {
+ case 0xED:
+ limit = 0x9F; /* ED A0..BF would encode the surrogate range */
+ break;
+ case 0xF4:
+ limit = 0x8F; /* F4 90 and up would exceed U+10FFFF */
+ break;
+ default:
+ limit = 0xBF;
+ break;
+ }
+ if (a < limit)
+ {
+ charptr[1]++;
+ break;
+ }
+ /* FALL THRU */
+ case 1:
+ a = *charptr;
+ if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) /* largest first byte for lengths 1, 2, 3, 4 */
+ return false;
+ charptr[0]++;
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * EUC-JP character incrementer function.
+ *
+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
+ * representing JIS X 0201 characters with the second byte ranging between
+ * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.
+ *
+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
+ * in which the last two bytes range between 0xa1 and 0xfe. The last byte
+ * is incremented if possible, otherwise the second-to-last byte.
+ *
+ * If the sequence starts with a value other than the above and its MSB
+ * is set, it must be a two-byte sequence representing JIS X 0208 characters
+ * with both bytes ranging between 0xa1 and 0xfe. The last byte is
+ * incremented if possible, otherwise the second-to-last byte.
+ *
+ * Otherwise, the sequence is a single-byte ASCII character. It is
+ * incremented up to 0x7f.
+ *
+ * charptr points at the character, which is modified in place; length is
+ * its length in bytes. Returns true if a byte was changed. Returns false
+ * if length does not match what the first byte requires (this is checked
+ * for the multibyte cases only), or if no examined byte can be advanced.
+ * In every multibyte case, a byte found below 0xa1 is set to 0xa1 rather
+ * than incremented.
+ */
+static bool
+pg_eucjp_increment(unsigned char *charptr, int length)
+{
+ unsigned char c1,
+ c2;
+ int i;
+
+ c1 = *charptr;
+
+ switch (c1)
+ {
+ case SS2: /* JIS X 0201 */
+ if (length != 2)
+ return false;
+
+ c2 = charptr[1];
+
+ if (c2 >= 0xdf)
+ charptr[0] = charptr[1] = 0xa1; /* leaves the SS2 range; still reported as success */
+ else if (c2 < 0xa1)
+ charptr[1] = 0xa1;
+ else
+ charptr[1]++;
+ break;
+
+ case SS3: /* JIS X 0212 */
+ if (length != 3)
+ return false;
+
+ for (i = 2; i > 0; i--) /* bytes 2 and 1 only; the SS3 byte itself is never changed */
+ {
+ c2 = charptr[i];
+ if (c2 < 0xa1)
+ {
+ charptr[i] = 0xa1;
+ return true;
+ }
+ else if (c2 < 0xfe)
+ {
+ charptr[i]++;
+ return true;
+ }
+ }
+
+ /* Out of 3-byte code region */
+ return false;
+
+ default:
+ if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
+ {
+ if (length != 2)
+ return false;
+
+ for (i = 1; i >= 0; i--) /* unlike the SS3 case, the first byte may be changed too */
+ {
+ c2 = charptr[i];
+ if (c2 < 0xa1)
+ {
+ charptr[i] = 0xa1;
+ return true;
+ }
+ else if (c2 < 0xfe)
+ {
+ charptr[i]++;
+ return true;
+ }
+ }
+
+ /* Out of 2 byte code region */
+ return false;
+ }
+ else
+ { /* ASCII, single byte */
+ if (c1 > 0x7e)
+ return false;
+ (*charptr)++;
+ }
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * get the character incrementer for the encoding for the current database
+ *
+ * Returns pg_utf8_increment for PG_UTF8, pg_eucjp_increment for PG_EUC_JP,
+ * and pg_generic_charinc for every other value of GetDatabaseEncoding().
+ */
+mbcharacter_incrementer
+pg_database_encoding_character_incrementer(void)
+{
+ /*
+ * Eventually it might be best to add a field to pg_wchar_table[], but for
+ * now we just use a switch.
+ */
+ switch (GetDatabaseEncoding())
+ {
+ case PG_UTF8:
+ return pg_utf8_increment;
+
+ case PG_EUC_JP:
+ return pg_eucjp_increment;
+
+ default:
+ return pg_generic_charinc;
+ }
+}
+
+/*
+ * fetch maximum length of the encoding for the current database
+ *
+ * The value is the maxmblen field of the pg_wchar_table[] entry selected by
+ * GetDatabaseEncoding().
+ */
+int
+pg_database_encoding_max_length(void)
+{
+ return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding. Otherwise same as pg_verify_mbstr().
+ *
+ * mbstr, len and noError are passed unchanged to pg_verify_mbstr_len()
+ * together with GetDatabaseEncoding(). Returns true when that call returns
+ * a non-negative value.
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+ return
+ pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * All four arguments are passed unchanged to pg_verify_mbstr_len(). Returns
+ * true when that call returns a non-negative value.
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+ return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
+ * specified by len.
+ *
+ * If OK, return length of string in the encoding.
+ * If a problem is found, return -1 when noError is
+ * true; when noError is false, ereport() a descriptive message.
+ *
+ * The returned length counts characters, not bytes: mb_len is advanced once
+ * per verified character. For encodings whose maximum character length is
+ * 1, len itself is returned when no zero byte is present.
+ *
+ * A zero byte is rejected in every encoding, either by the memchr() test or
+ * by the ASCII fast path in the loop.
+ */
+int
+pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
+{
+ mbverifier mbverify;
+ int mb_len;
+
+ Assert(PG_VALID_ENCODING(encoding));
+
+ /*
+ * In single-byte encodings, we need only reject nulls (\0).
+ */
+ if (pg_encoding_max_length(encoding) <= 1)
+ {
+ const char *nullpos = memchr(mbstr, 0, len);
+
+ if (nullpos == NULL)
+ return len;
+ if (noError)
+ return -1;
+ report_invalid_encoding(encoding, nullpos, 1); /* NOTE(review): no return follows; relies on this call not returning -- confirm */
+ }
+
+ /* fetch function pointer just once */
+ mbverify = pg_wchar_table[encoding].mbverify;
+
+ mb_len = 0;
+
+ while (len > 0)
+ {
+ int l;
+
+ /* fast path for ASCII-subset characters */
+ if (!IS_HIGHBIT_SET(*mbstr))
+ {
+ if (*mbstr != '\0')
+ {
+ mb_len++;
+ mbstr++;
+ len--;
+ continue;
+ }
+ if (noError)
+ return -1;
+ report_invalid_encoding(encoding, mbstr, len); /* zero byte inside the string */
+ }
+
+ l = (*mbverify) ((const unsigned char *) mbstr, len);
+
+ if (l < 0) /* a negative result is treated as an invalid sequence */
+ {
+ if (noError)
+ return -1;
+ report_invalid_encoding(encoding, mbstr, len);
+ }
+
+ mbstr += l; /* otherwise l is the number of bytes to skip */
+ len -= l;
+ mb_len++;
+ }
+ return mb_len;
+}
+
+/*
+ * check_encoding_conversion_args: check arguments of a conversion function
+ *
+ * "expected" arguments can be either an encoding ID or -1 to indicate that
+ * the caller will check whether it accepts the ID.
+ *
+ * Note: the errors here are not really user-facing, so elog instead of
+ * ereport seems sufficient. Also, we trust that the "expected" encoding
+ * arguments are valid encoding IDs, but we don't trust the actuals.
+ *
+ * Checks, in order: src_encoding passes PG_VALID_ENCODING(); it equals
+ * expected_src_encoding unless that is negative; the same two checks for
+ * dest_encoding; and finally that len is not negative. The first failing
+ * check raises elog(ERROR). Each validity check precedes the comparison
+ * that uses the actual ID to index pg_enc2name_tbl[].
+ */
+void
+check_encoding_conversion_args(int src_encoding,
+ int dest_encoding,
+ int len,
+ int expected_src_encoding,
+ int expected_dest_encoding)
+{
+ if (!PG_VALID_ENCODING(src_encoding))
+ elog(ERROR, "invalid source encoding ID: %d", src_encoding);
+ if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
+ elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
+ pg_enc2name_tbl[expected_src_encoding].name,
+ pg_enc2name_tbl[src_encoding].name);
+ if (!PG_VALID_ENCODING(dest_encoding))
+ elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
+ if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
+ elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
+ pg_enc2name_tbl[expected_dest_encoding].name,
+ pg_enc2name_tbl[dest_encoding].name);
+ if (len < 0)
+ elog(ERROR, "encoding conversion length must not be negative");
+}
+
+/*
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ *
+ * The message shows the first min(character length, len, 8) bytes at mbstr,
+ * where the character length comes from pg_encoding_mblen(). Each byte is
+ * formatted as 0xNN and the bytes are separated by single spaces. The
+ * function finishes by calling ereport(ERROR) with
+ * ERRCODE_CHARACTER_NOT_IN_REPERTOIRE.
+ */
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
+{
+ int l = pg_encoding_mblen(encoding, mbstr);
+ char buf[8 * 5 + 1]; /* 8 bytes x (4 chars of hex + separator) + NUL */
+ char *p = buf;
+ int j,
+ jlimit;
+
+ jlimit = Min(l, len);
+ jlimit = Min(jlimit, 8); /* prevent buffer overrun */
+
+ for (j = 0; j < jlimit; j++)
+ {
+ p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+ if (j < jlimit - 1) /* no separator after the final byte */
+ p += sprintf(p, " ");
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ errmsg("invalid byte sequence for encoding \"%s\": %s",
+ pg_enc2name_tbl[encoding].name,
+ buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ *
+ * The message shows the first min(character length, len, 8) bytes at mbstr,
+ * where the character length comes from pg_encoding_mblen() for
+ * src_encoding. Each byte is formatted as 0xNN and the bytes are separated
+ * by single spaces. The function finishes by calling ereport(ERROR) with
+ * ERRCODE_UNTRANSLATABLE_CHARACTER, naming both encodings.
+ */
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+ const char *mbstr, int len)
+{
+ int l = pg_encoding_mblen(src_encoding, mbstr);
+ char buf[8 * 5 + 1]; /* 8 bytes x (4 chars of hex + separator) + NUL */
+ char *p = buf;
+ int j,
+ jlimit;
+
+ jlimit = Min(l, len);
+ jlimit = Min(jlimit, 8); /* prevent buffer overrun */
+
+ for (j = 0; j < jlimit; j++)
+ {
+ p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
+ if (j < jlimit - 1) /* no separator after the final byte */
+ p += sprintf(p, " ");
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+ errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
+ buf,
+ pg_enc2name_tbl[src_encoding].name,
+ pg_enc2name_tbl[dest_encoding].name)));
+}
+
+
#ifdef WIN32
/*
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
return utf16;
}
-#endif
+#endif /* WIN32 */
*
*-------------------------------------------------------------------------
*/
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#include "utils/builtins.h"
-#endif
+#include "c.h"
#include <ctype.h>
#include <unistd.h>
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#endif
+
const pg_enc2name pg_enc2name_tbl[] =
{
DEF_ENC2NAME(SQL_ASCII, 0),
};
-#ifndef FRONTEND
-
/*
- * Table of encoding names for ICU
+ * Table of encoding names for ICU (currently covers backend encodings only)
*
* Reference: <https://ssl.icu-project.org/icu-bin/convexp>
*
"KOI8-U", /* PG_KOI8U */
};
+
+/*
+ * Is this encoding supported by ICU?
+ *
+ * Returns false for any encoding ID that fails PG_VALID_BE_ENCODING(), so
+ * pg_enc2icu_tbl[] is only indexed with IDs that passed that check.
+ * Otherwise returns whether the table entry for the encoding is non-NULL.
+ */
bool
is_encoding_supported_by_icu(int encoding)
{
+ if (!PG_VALID_BE_ENCODING(encoding))
+ return false;
return (pg_enc2icu_tbl[encoding] != NULL);
}
+/*
+ * Returns ICU's name for encoding, or NULL if not supported
+ *
+ * NULL is also returned for an encoding ID that fails PG_VALID_BE_ENCODING().
+ * The new version of this function does not raise an error itself, so
+ * callers must check the result for NULL.
+ */
const char *
get_encoding_name_for_icu(int encoding)
{
- const char *icu_encoding_name;
-
StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
"pg_enc2icu_tbl incomplete");
- icu_encoding_name = pg_enc2icu_tbl[encoding];
-
- if (!icu_encoding_name)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("encoding \"%s\" not supported by ICU",
- pg_encoding_to_char(encoding))));
-
- return icu_encoding_name;
+ if (!PG_VALID_BE_ENCODING(encoding))
+ return NULL;
+ return pg_enc2icu_tbl[encoding];
}
-#endif /* not FRONTEND */
-
/* ----------
* Encoding checks, for error returns -1 else encoding id
return PG_VALID_BE_ENCODING(encoding);
}
-/* ----------
- * Remove irrelevant chars from encoding name
- * ----------
+/*
+ * Remove irrelevant chars from encoding name, store at *newkey
+ *
+ * (Caller's responsibility to provide a large enough buffer)
*/
static char *
clean_encoding_name(const char *key, char *newkey)
return newkey;
}
-/* ----------
+/*
* Search encoding by encoding name
*
- * Returns encoding ID, or -1 for error
- * ----------
+ * Returns encoding ID, or -1 if not recognized
*/
int
pg_char_to_encoding(const char *name)
return -1;
if (strlen(name) >= NAMEDATALEN)
- {
-#ifdef FRONTEND
- fprintf(stderr, "encoding name too long\n");
- return -1;
-#else
- ereport(ERROR,
- (errcode(ERRCODE_NAME_TOO_LONG),
- errmsg("encoding name too long")));
-#endif
- }
+ return -1; /* it's certainly not in the table */
+
key = clean_encoding_name(name, buff);
while (last >= base)
return -1;
}
-#ifndef FRONTEND
-Datum
-PG_char_to_encoding(PG_FUNCTION_ARGS)
-{
- Name s = PG_GETARG_NAME(0);
-
- PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
-}
-#endif
-
const char *
pg_encoding_to_char(int encoding)
{
}
return "";
}
-
-#ifndef FRONTEND
-Datum
-PG_encoding_to_char(PG_FUNCTION_ARGS)
-{
- int32 encoding = PG_GETARG_INT32(0);
- const char *encoding_name = pg_encoding_to_char(encoding);
-
- return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
-}
-
-#endif
*
*-------------------------------------------------------------------------
*/
-#ifdef FRONTEND
-#include "postgres_fe.h"
-#else
-#include "postgres.h"
-#endif
+#include "c.h"
#include "mb/pg_wchar.h"
return cnt;
}
+/* exported for direct use by conv.c */
int
pg_mule_mblen(const unsigned char *s)
{
return true;
}
-#ifndef FRONTEND
-
-/*
- * Generic character incrementer function.
- *
- * Not knowing anything about the properties of the encoding in use, we just
- * keep incrementing the last byte until we get a validly-encoded result,
- * or we run out of values to try. We don't bother to try incrementing
- * higher-order bytes, so there's no growth in runtime for wider characters.
- * (If we did try to do that, we'd need to consider the likelihood that 255
- * is not a valid final byte in the encoding.)
- */
-static bool
-pg_generic_charinc(unsigned char *charptr, int len)
-{
- unsigned char *lastbyte = charptr + len - 1;
- mbverifier mbverify;
-
- /* We can just invoke the character verifier directly. */
- mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
-
- while (*lastbyte < (unsigned char) 255)
- {
- (*lastbyte)++;
- if ((*mbverify) (charptr, len) == len)
- return true;
- }
-
- return false;
-}
-
-/*
- * UTF-8 character incrementer function.
- *
- * For a one-byte character less than 0x7F, we just increment the byte.
- *
- * For a multibyte character, every byte but the first must fall between 0x80
- * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
- * the last byte that's not already at its maximum value. If we can't find a
- * byte that's less than the maximum allowable value, we simply fail. We also
- * need some special-case logic to skip regions used for surrogate pair
- * handling, as those should not occur in valid UTF-8.
- *
- * Note that we don't reset lower-order bytes back to their minimums, since
- * we can't afford to make an exhaustive search (see make_greater_string).
- */
-static bool
-pg_utf8_increment(unsigned char *charptr, int length)
-{
- unsigned char a;
- unsigned char limit;
-
- switch (length)
- {
- default:
- /* reject lengths 5 and 6 for now */
- return false;
- case 4:
- a = charptr[3];
- if (a < 0xBF)
- {
- charptr[3]++;
- break;
- }
- /* FALL THRU */
- case 3:
- a = charptr[2];
- if (a < 0xBF)
- {
- charptr[2]++;
- break;
- }
- /* FALL THRU */
- case 2:
- a = charptr[1];
- switch (*charptr)
- {
- case 0xED:
- limit = 0x9F;
- break;
- case 0xF4:
- limit = 0x8F;
- break;
- default:
- limit = 0xBF;
- break;
- }
- if (a < limit)
- {
- charptr[1]++;
- break;
- }
- /* FALL THRU */
- case 1:
- a = *charptr;
- if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
- return false;
- charptr[0]++;
- break;
- }
-
- return true;
-}
-
-/*
- * EUC-JP character incrementer function.
- *
- * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
- * representing JIS X 0201 characters with the second byte ranging between
- * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
- * and otherwise rewrite the whole sequence to 0xa1 0xa1.
- *
- * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
- * in which the last two bytes range between 0xa1 and 0xfe. The last byte
- * is incremented if possible, otherwise the second-to-last byte.
- *
- * If the sequence starts with a value other than the above and its MSB
- * is set, it must be a two-byte sequence representing JIS X 0208 characters
- * with both bytes ranging between 0xa1 and 0xfe. The last byte is
- * incremented if possible, otherwise the second-to-last byte.
- *
- * Otherwise, the sequence is a single-byte ASCII character. It is
- * incremented up to 0x7f.
- */
-static bool
-pg_eucjp_increment(unsigned char *charptr, int length)
-{
- unsigned char c1,
- c2;
- int i;
-
- c1 = *charptr;
-
- switch (c1)
- {
- case SS2: /* JIS X 0201 */
- if (length != 2)
- return false;
-
- c2 = charptr[1];
-
- if (c2 >= 0xdf)
- charptr[0] = charptr[1] = 0xa1;
- else if (c2 < 0xa1)
- charptr[1] = 0xa1;
- else
- charptr[1]++;
- break;
-
- case SS3: /* JIS X 0212 */
- if (length != 3)
- return false;
-
- for (i = 2; i > 0; i--)
- {
- c2 = charptr[i];
- if (c2 < 0xa1)
- {
- charptr[i] = 0xa1;
- return true;
- }
- else if (c2 < 0xfe)
- {
- charptr[i]++;
- return true;
- }
- }
-
- /* Out of 3-byte code region */
- return false;
-
- default:
- if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
- {
- if (length != 2)
- return false;
-
- for (i = 1; i >= 0; i--)
- {
- c2 = charptr[i];
- if (c2 < 0xa1)
- {
- charptr[i] = 0xa1;
- return true;
- }
- else if (c2 < 0xfe)
- {
- charptr[i]++;
- return true;
- }
- }
-
- /* Out of 2 byte code region */
- return false;
- }
- else
- { /* ASCII, single byte */
- if (c1 > 0x7e)
- return false;
- (*charptr)++;
- }
- break;
- }
-
- return true;
-}
-#endif /* !FRONTEND */
-
/*
*-------------------------------------------------------------------
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
};
-/* returns the byte length of a word for mule internal code */
-int
-pg_mic_mblen(const unsigned char *mbstr)
-{
- return pg_mule_mblen(mbstr);
-}
-
/*
* Returns the byte length of a multibyte character.
*/
return pg_wchar_table[encoding].maxmblen;
}
-
-#ifndef FRONTEND
-
-/*
- * fetch maximum length of the encoding for the current database
- */
-int
-pg_database_encoding_max_length(void)
-{
- return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
-}
-
-/*
- * get the character incrementer for the encoding for the current database
- */
-mbcharacter_incrementer
-pg_database_encoding_character_incrementer(void)
-{
- /*
- * Eventually it might be best to add a field to pg_wchar_table[], but for
- * now we just use a switch.
- */
- switch (GetDatabaseEncoding())
- {
- case PG_UTF8:
- return pg_utf8_increment;
-
- case PG_EUC_JP:
- return pg_eucjp_increment;
-
- default:
- return pg_generic_charinc;
- }
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the current
- * database encoding. Otherwise same as pg_verify_mbstr().
- */
-bool
-pg_verifymbstr(const char *mbstr, int len, bool noError)
-{
- return
- pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- */
-bool
-pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
-{
- return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
-}
-
-/*
- * Verify mbstr to make sure that it is validly encoded in the specified
- * encoding.
- *
- * mbstr is not necessarily zero terminated; length of mbstr is
- * specified by len.
- *
- * If OK, return length of string in the encoding.
- * If a problem is found, return -1 when noError is
- * true; when noError is false, ereport() a descriptive message.
- */
-int
-pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
-{
- mbverifier mbverify;
- int mb_len;
-
- Assert(PG_VALID_ENCODING(encoding));
-
- /*
- * In single-byte encodings, we need only reject nulls (\0).
- */
- if (pg_encoding_max_length(encoding) <= 1)
- {
- const char *nullpos = memchr(mbstr, 0, len);
-
- if (nullpos == NULL)
- return len;
- if (noError)
- return -1;
- report_invalid_encoding(encoding, nullpos, 1);
- }
-
- /* fetch function pointer just once */
- mbverify = pg_wchar_table[encoding].mbverify;
-
- mb_len = 0;
-
- while (len > 0)
- {
- int l;
-
- /* fast path for ASCII-subset characters */
- if (!IS_HIGHBIT_SET(*mbstr))
- {
- if (*mbstr != '\0')
- {
- mb_len++;
- mbstr++;
- len--;
- continue;
- }
- if (noError)
- return -1;
- report_invalid_encoding(encoding, mbstr, len);
- }
-
- l = (*mbverify) ((const unsigned char *) mbstr, len);
-
- if (l < 0)
- {
- if (noError)
- return -1;
- report_invalid_encoding(encoding, mbstr, len);
- }
-
- mbstr += l;
- len -= l;
- mb_len++;
- }
- return mb_len;
-}
-
-/*
- * check_encoding_conversion_args: check arguments of a conversion function
- *
- * "expected" arguments can be either an encoding ID or -1 to indicate that
- * the caller will check whether it accepts the ID.
- *
- * Note: the errors here are not really user-facing, so elog instead of
- * ereport seems sufficient. Also, we trust that the "expected" encoding
- * arguments are valid encoding IDs, but we don't trust the actuals.
- */
-void
-check_encoding_conversion_args(int src_encoding,
- int dest_encoding,
- int len,
- int expected_src_encoding,
- int expected_dest_encoding)
-{
- if (!PG_VALID_ENCODING(src_encoding))
- elog(ERROR, "invalid source encoding ID: %d", src_encoding);
- if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
- elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
- pg_enc2name_tbl[expected_src_encoding].name,
- pg_enc2name_tbl[src_encoding].name);
- if (!PG_VALID_ENCODING(dest_encoding))
- elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
- if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
- elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
- pg_enc2name_tbl[expected_dest_encoding].name,
- pg_enc2name_tbl[dest_encoding].name);
- if (len < 0)
- elog(ERROR, "encoding conversion length must not be negative");
-}
-
-/*
- * report_invalid_encoding: complain about invalid multibyte character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_invalid_encoding(int encoding, const char *mbstr, int len)
-{
- int l = pg_encoding_mblen(encoding, mbstr);
- char buf[8 * 5 + 1];
- char *p = buf;
- int j,
- jlimit;
-
- jlimit = Min(l, len);
- jlimit = Min(jlimit, 8); /* prevent buffer overrun */
-
- for (j = 0; j < jlimit; j++)
- {
- p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
- if (j < jlimit - 1)
- p += sprintf(p, " ");
- }
-
- ereport(ERROR,
- (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("invalid byte sequence for encoding \"%s\": %s",
- pg_enc2name_tbl[encoding].name,
- buf)));
-}
-
-/*
- * report_untranslatable_char: complain about untranslatable character
- *
- * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
- */
-void
-report_untranslatable_char(int src_encoding, int dest_encoding,
- const char *mbstr, int len)
-{
- int l = pg_encoding_mblen(src_encoding, mbstr);
- char buf[8 * 5 + 1];
- char *p = buf;
- int j,
- jlimit;
-
- jlimit = Min(l, len);
- jlimit = Min(jlimit, 8); /* prevent buffer overrun */
-
- for (j = 0; j < jlimit; j++)
- {
- p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
- if (j < jlimit - 1)
- p += sprintf(p, " ");
- }
-
- ereport(ERROR,
- (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
- errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
- buf,
- pg_enc2name_tbl[src_encoding].name,
- pg_enc2name_tbl[dest_encoding].name)));
-}
-
-#endif /* !FRONTEND */
* src/include/mb/pg_wchar.h
*
* NOTES
- * This is used both by the backend and by libpq, but should not be
+ * This is used both by the backend and by frontends, but should not be
* included by libpq client programs. In particular, a libpq client
* should not assume that the encoding IDs used by the version of libpq
* it's linked to match up with the IDs declared here.
extern const pg_enc2gettext pg_enc2gettext_tbl[];
-/*
- * Encoding names for ICU
- */
-extern bool is_encoding_supported_by_icu(int encoding);
-extern const char *get_encoding_name_for_icu(int encoding);
-
/*
* pg_wchar stuff
*/
extern int pg_valid_server_encoding_id(int encoding);
/*
- * Remaining functions are not considered part of libpq's API, though many
- * of them do exist inside libpq.
+ * These functions are available to frontend code that links with libpgcommon
+ * (in addition to the ones just above). The constant tables declared
+ * earlier in this file are also available from libpgcommon.
+ */
+extern int pg_encoding_mblen(int encoding, const char *mbstr);
+extern int pg_encoding_dsplen(int encoding, const char *mbstr);
+extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
+extern int pg_encoding_max_length(int encoding);
+extern int pg_valid_client_encoding(const char *name);
+extern int pg_valid_server_encoding(const char *name);
+extern bool is_encoding_supported_by_icu(int encoding);
+extern const char *get_encoding_name_for_icu(int encoding);
+
+extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
+extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern bool pg_utf8_islegal(const unsigned char *source, int length);
+extern int pg_utf_mblen(const unsigned char *s);
+extern int pg_mule_mblen(const unsigned char *s);
+
+/*
+ * The remaining functions are backend-only.
*/
extern int pg_mb2wchar(const char *from, pg_wchar *to);
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
extern size_t pg_wchar_strlen(const pg_wchar *wstr);
extern int pg_mblen(const char *mbstr);
extern int pg_dsplen(const char *mbstr);
-extern int pg_encoding_mblen(int encoding, const char *mbstr);
-extern int pg_encoding_dsplen(int encoding, const char *mbstr);
-extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
-extern int pg_mule_mblen(const unsigned char *mbstr);
-extern int pg_mic_mblen(const unsigned char *mbstr);
extern int pg_mbstrlen(const char *mbstr);
extern int pg_mbstrlen_with_len(const char *mbstr, int len);
extern int pg_mbcliplen(const char *mbstr, int len, int limit);
extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
int len, int limit);
extern int pg_mbcharcliplen(const char *mbstr, int len, int limit);
-extern int pg_encoding_max_length(int encoding);
extern int pg_database_encoding_max_length(void);
extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
extern int pg_bind_textdomain_codeset(const char *domainname);
#endif
-extern int pg_valid_client_encoding(const char *name);
-extern int pg_valid_server_encoding(const char *name);
-
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
-extern int pg_utf_mblen(const unsigned char *);
extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
int src_encoding,
int dest_encoding);
int len, int lc, int encoding,
const unsigned char *tab);
-extern bool pg_utf8_islegal(const unsigned char *source, int length);
-
#ifdef WIN32
extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
#endif