Refactor to add pg_strcoll(), pg_strxfrm(), and variants.

author Jeff Davis <jdavis@postgresql.org>

Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)

committer Jeff Davis <jdavis@postgresql.org>

Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)
author Jeff Davis <jdavis@postgresql.org>
Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)
committer Jeff Davis <jdavis@postgresql.org>
Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c

index e3e40d6c21aaa219714f6123d5a26b8f2caa2edc..2deba44abd3b568ae1fd6782e502e93adeee31f6 100644 (file)
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -292,21 +292,24 @@ hashtext(PG_FUNCTION_ARGS)
  #ifdef USE_ICU
                 if (mylocale->provider == COLLPROVIDER_ICU)
                 {
-                       int32_t         ulen = -1;
-                       UChar      *uchar = NULL;
-                       Size            bsize;
-                       uint8_t    *buf;
+                       Size            bsize, rsize;
+                       char       *buf;
+                       const char *keydata = VARDATA_ANY(key);
+                       size_t          keylen = VARSIZE_ANY_EXHDR(key);
  
-                       ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+                       bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
+                       buf = palloc(bsize + 1);
  
-                       bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-                                                                       uchar, ulen, NULL, 0);
-                       buf = palloc(bsize);
-                       ucol_getSortKey(mylocale->info.icu.ucol,
-                                                       uchar, ulen, buf, bsize);
-                       pfree(uchar);
+                       rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
+                       if (rsize != bsize)
+                               elog(ERROR, "pg_strnxfrm() returned unexpected result");
  
-                       result = hash_any(buf, bsize);
+                       /*
+                        * In principle, there's no reason to include the terminating NUL
+                        * character in the hash, but it was done before and the behavior
+                        * must be preserved.
+                        */
+                       result = hash_any((uint8_t *) buf, bsize + 1);
  
                         pfree(buf);
                 }
@@ -350,21 +353,25 @@ hashtextextended(PG_FUNCTION_ARGS)
  #ifdef USE_ICU
                 if (mylocale->provider == COLLPROVIDER_ICU)
                 {
-                       int32_t         ulen = -1;
-                       UChar      *uchar = NULL;
-                       Size            bsize;
-                       uint8_t    *buf;
-
-                       ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
-
-                       bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-                                                                       uchar, ulen, NULL, 0);
-                       buf = palloc(bsize);
-                       ucol_getSortKey(mylocale->info.icu.ucol,
-                                                       uchar, ulen, buf, bsize);
-                       pfree(uchar);
-
-                       result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+                       Size            bsize, rsize;
+                       char       *buf;
+                       const char *keydata = VARDATA_ANY(key);
+                       size_t          keylen = VARSIZE_ANY_EXHDR(key);
+
+                       bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
+                       buf = palloc(bsize + 1);
+
+                       rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
+                       if (rsize != bsize)
+                               elog(ERROR, "pg_strnxfrm() returned unexpected result");
+
+                       /*
+                        * In principle, there's no reason to include the terminating NUL
+                        * character in the hash, but it was done before and the behavior
+                        * must be preserved.
+                        */
+                       result = hash_any_extended((uint8_t *) buf, bsize + 1,
+                                                                          PG_GETARG_INT64(1));
  
                         pfree(buf);
                 }
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c

index 059e4fd79f08d7ba739d8b4b3d539f290a641dd0..ef9efb4a7c901502e593094a41e1ea87a135a115 100644 (file)
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -79,6 +79,12 @@
  #include <shlwapi.h>
  #endif
  
+/*
+ * This should be large enough that most strings will fit, but small enough
+ * that we feel comfortable putting it on the stack
+ */
+#define                TEXTBUFLEN                      1024
+
  #define                MAX_L10N_DATA           80
  
  
@@ -123,6 +129,19 @@ static char *IsoLocaleName(const char *);
  #endif
  
  #ifdef USE_ICU
+/*
+ * Converter object for converting between ICU's UChar strings and C strings
+ * in database encoding.  Since the database encoding doesn't change, we only
+ * need one of these per session.
+ */
+static UConverter *icu_converter = NULL;
+
+static void init_icu_converter(void);
+static size_t uchar_length(UConverter *converter,
+                                                  const char *str, int32_t len);
+static int32_t uchar_convert(UConverter *converter,
+                                                        UChar *dest, int32_t destlen,
+                                                        const char *str, int32_t srclen);
  static void icu_set_collation_attributes(UCollator *collator, const char *loc);
  #endif
  
@@ -1731,15 +1750,705 @@ get_collation_actual_version(char collprovider, const char *collcollate)
         return collversion;
  }
  
+/*
+ * pg_strncoll_libc_win32_utf8
+ *
+ * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
+ * invoke wcscoll() or wcscoll_l().
+ */
+#ifdef WIN32
+static int
+pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
+                                                       size_t len2, pg_locale_t locale)
+{
+       char            sbuf[TEXTBUFLEN];
+       char       *buf = sbuf;
+       char       *a1p,
+                          *a2p;
+
+       /* 2 bytes per input byte, plus 2 bytes for the wide-char terminator */
+       int                     a1len = len1 * 2 + 2;
+       int                     a2len = len2 * 2 + 2;
+       int                     r;
+       int                     result;
+
+       Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+       Assert(GetDatabaseEncoding() == PG_UTF8);
+
+       /* use the stack buffer unless both converted strings won't fit in it */
+       if (a1len + a2len > TEXTBUFLEN)
+               buf = palloc(a1len + a2len);
+
+       /* the two wide strings are stored back to back in one buffer */
+       a1p = buf;
+       a2p = buf + a1len;
+
+       /* API does not work for zero-length input */
+       if (len1 == 0)
+               r = 0;
+       else
+       {
+               r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
+                                                               (LPWSTR) a1p, a1len / 2);
+               if (!r)
+                       ereport(ERROR,
+                                       (errmsg("could not convert string to UTF-16: error code %lu",
+                                                       GetLastError())));
+       }
+       /* r is the number of wide characters written; terminate after them */
+       ((LPWSTR) a1p)[r] = 0;
+
+       if (len2 == 0)
+               r = 0;
+       else
+       {
+               r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
+                                                               (LPWSTR) a2p, a2len / 2);
+               if (!r)
+                       ereport(ERROR,
+                                       (errmsg("could not convert string to UTF-16: error code %lu",
+                                                       GetLastError())));
+       }
+       ((LPWSTR) a2p)[r] = 0;
+
+       /* reset errno so that %m below reports the comparison's own failure */
+       errno = 0;
+#ifdef HAVE_LOCALE_T
+       if (locale)
+               result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
+       else
+#endif
+               result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
+       if (result == 2147483647)       /* _NLSCMPERROR; missing from mingw
+                                                                * headers */
+               ereport(ERROR,
+                               (errmsg("could not compare Unicode strings: %m")));
+
+       /* release the work buffer only if it was heap-allocated */
+       if (buf != sbuf)
+               pfree(buf);
+
+       return result;
+}
+#endif                                                 /* WIN32 */
+
+/*
+ * pg_strcoll_libc
+ *
+ * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
+ * the given locale, platform, and database encoding. If the locale is NULL,
+ * use the database collation.
+ *
+ * Arguments must be encoded in the database encoding and nul-terminated.
+ */
+static int
+pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
+{
+       Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef WIN32
+       /* UTF-8 databases on Windows go through the wide-character path */
+       if (GetDatabaseEncoding() == PG_UTF8)
+               return pg_strncoll_libc_win32_utf8(arg1, strlen(arg1),
+                                                                                  arg2, strlen(arg2),
+                                                                                  locale);
+#endif                                                 /* WIN32 */
+
+       /* no explicit locale: compare with plain strcoll() */
+       if (!locale)
+               return strcoll(arg1, arg2);
+
+#ifdef HAVE_LOCALE_T
+       return strcoll_l(arg1, arg2, locale->info.lt);
+#else
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+#endif
+}
+
+/*
+ * pg_strncoll_libc
+ *
+ * Nul-terminate the arguments and call pg_strcoll_libc().
+ */
+static int
+pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
+                                pg_locale_t locale)
+{
+       char            stackbuf[TEXTBUFLEN];
+       char       *work = stackbuf;
+       size_t          need1 = len1 + 1;
+       size_t          need2 = len2 + 1;
+       char       *copy1;
+       char       *copy2;
+       int                     cmp;
+
+       Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef WIN32
+       /* check for this case before doing the work for nul-termination */
+       if (GetDatabaseEncoding() == PG_UTF8)
+               return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+#endif                                                 /* WIN32 */
+
+       /* fall back to the heap when both copies won't fit on the stack */
+       if (need1 + need2 > TEXTBUFLEN)
+               work = palloc(need1 + need2);
+
+       /* lay the two nul-terminated copies out back to back */
+       copy1 = work;
+       copy2 = work + need1;
+
+       memcpy(copy1, arg1, len1);
+       copy1[len1] = '\0';
+       memcpy(copy2, arg2, len2);
+       copy2[len2] = '\0';
+
+       cmp = pg_strcoll_libc(copy1, copy2, locale);
+
+       if (work != stackbuf)
+               pfree(work);
+
+       return cmp;
+}
  
  #ifdef USE_ICU
+
  /*
- * Converter object for converting between ICU's UChar strings and C strings
- * in database encoding.  Since the database encoding doesn't change, we only
- * need one of these per session.
+ * pg_strncoll_icu_no_utf8
+ *
+ * Convert the arguments from the database encoding to UChar strings, then
+ * call ucol_strcoll(). An argument length of -1 means that the string is
+ * NUL-terminated.
+ *
+ * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
+ * caller should call that instead.
   */
-static UConverter *icu_converter = NULL;
+static int
+pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
+                                               const char *arg2, int32_t len2, pg_locale_t locale)
+{
+       char            stackbuf[TEXTBUFLEN];
+       char       *work = stackbuf;
+       int32_t         nuchars1;
+       int32_t         nuchars2;
+       size_t          nbytes1;
+       size_t          nbytes2;
+       UChar      *ustr1;
+       UChar      *ustr2;
+       int                     cmp;
+
+       Assert(locale->provider == COLLPROVIDER_ICU);
+#ifdef HAVE_UCOL_STRCOLLUTF8
+       Assert(GetDatabaseEncoding() != PG_UTF8);
+#endif
+
+       init_icu_converter();
+
+       /* first pass: measure both strings in UChars */
+       nuchars1 = uchar_length(icu_converter, arg1, len1);
+       nuchars2 = uchar_length(icu_converter, arg2, len2);
+
+       /* each converted string gets one extra UChar for a terminator */
+       nbytes1 = (nuchars1 + 1) * sizeof(UChar);
+       nbytes2 = (nuchars2 + 1) * sizeof(UChar);
+
+       if (nbytes1 + nbytes2 > TEXTBUFLEN)
+               work = palloc(nbytes1 + nbytes2);
+
+       ustr1 = (UChar *) work;
+       ustr2 = (UChar *) (work + nbytes1);
+
+       /* second pass: perform the actual conversion */
+       nuchars1 = uchar_convert(icu_converter, ustr1, nuchars1 + 1, arg1, len1);
+       nuchars2 = uchar_convert(icu_converter, ustr2, nuchars2 + 1, arg2, len2);
+
+       cmp = ucol_strcoll(locale->info.icu.ucol,
+                                          ustr1, nuchars1,
+                                          ustr2, nuchars2);
+
+       if (work != stackbuf)
+               pfree(work);
+
+       return cmp;
+}
+
+/*
+ * pg_strncoll_icu
+ *
+ * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
+ * database encoding. An argument length of -1 means the string is
+ * NUL-terminated.
+ *
+ * Arguments must be encoded in the database encoding.
+ */
+static int
+pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
+                               pg_locale_t locale)
+{
+       Assert(locale->provider == COLLPROVIDER_ICU);
+
+#ifdef HAVE_UCOL_STRCOLLUTF8
+       /* UTF-8 input can be handed to ICU directly, without conversion */
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               UErrorCode      status = U_ZERO_ERROR;
+               int                     cmp;
+
+               cmp = ucol_strcollUTF8(locale->info.icu.ucol,
+                                                          arg1, len1,
+                                                          arg2, len2,
+                                                          &status);
+               if (U_FAILURE(status))
+                       ereport(ERROR,
+                                       (errmsg("collation failed: %s", u_errorName(status))));
+
+               return cmp;
+       }
+#endif
+
+       /* any other encoding is converted to UChars first */
+       return pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+}
+
+#endif                                                 /* USE_ICU */
+
+/*
+ * pg_strcoll
+ *
+ * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
+ * or wcscoll_l() as appropriate for the given locale, platform, and database
+ * encoding. If the locale is not specified, use the database collation.
+ *
+ * Arguments must be encoded in the database encoding and nul-terminated.
+ *
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strxfrm(), which cannot
+ * easily account for deterministic collations.
+ */
+int
+pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
+{
+       /* a NULL locale is handled by the libc code path */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               return pg_strcoll_libc(arg1, arg2, locale);
+
+#ifdef USE_ICU
+       /* length -1 tells the ICU routine the strings are NUL-terminated */
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strncoll_icu(arg1, -1, arg2, -1, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+/*
+ * pg_strncoll
+ *
+ * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
+ * or wcscoll_l() as appropriate for the given locale, platform, and database
+ * encoding. If the locale is not specified, use the database collation.
+ *
+ * Arguments must be encoded in the database encoding.
+ *
+ * This function may need to nul-terminate the arguments for libc functions;
+ * so if the caller already has nul-terminated strings, it should call
+ * pg_strcoll() instead.
+ *
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
+ * easily account for deterministic collations.
+ */
+int
+pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
+                       pg_locale_t locale)
+{
+       /* a NULL locale is handled by the libc code path */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               return pg_strncoll_libc(arg1, len1, arg2, len2, locale);
+
+#ifdef USE_ICU
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strncoll_icu(arg1, len1, arg2, len2, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+
+/*
+ * pg_strxfrm_libc
+ *
+ * Call strxfrm_l() when a locale is given (and HAVE_LOCALE_T is defined),
+ * otherwise strxfrm(). 'src' must be nul-terminated. Returns whatever the
+ * libc function returns.
+ *
+ * Only usable when TRUST_STRXFRM is defined; otherwise raises an error.
+ */
+static size_t
+pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
+                               pg_locale_t locale)
+{
+       Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef TRUST_STRXFRM
+#ifdef HAVE_LOCALE_T
+       if (locale)
+               return strxfrm_l(dest, src, destsize, locale->info.lt);
+       else
+#endif
+               return strxfrm(dest, src, destsize);
+#else
+       /*
+        * shouldn't happen.  'locale' may be NULL here, so report the provider
+        * as a constant rather than dereferencing it.
+        */
+       elog(ERROR, "unsupported collprovider: %c", COLLPROVIDER_LIBC);
+       return 0;                                       /* keep compiler quiet */
+#endif
+}
+
+/*
+ * pg_strnxfrm_libc
+ *
+ * Make a nul-terminated copy of 'src' and hand it to pg_strxfrm_libc().
+ */
+static size_t
+pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
+                                pg_locale_t locale)
+{
+       char            stackbuf[TEXTBUFLEN];
+       char       *copy = stackbuf;
+       size_t          need = srclen + 1;
+       size_t          nbytes;
+
+       Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+       /* use the heap only when the copy won't fit on the stack */
+       if (need > TEXTBUFLEN)
+               copy = palloc(need);
  
+       /* nul-terminate arguments */
+       memcpy(copy, src, srclen);
+       copy[srclen] = '\0';
+
+       nbytes = pg_strxfrm_libc(dest, copy, destsize, locale);
+
+       if (copy != stackbuf)
+               pfree(copy);
+
+       /* if dest is defined, it should be nul-terminated */
+       Assert(nbytes >= destsize || dest[nbytes] == '\0');
+
+       return nbytes;
+}
+
+#ifdef USE_ICU
+
+/*
+ * ICU implementation behind pg_strxfrm() and pg_strnxfrm(): convert 'src' to
+ * UChars and ask ucol_getSortKey() for the sort key. A 'srclen' of -1 means
+ * 'src' is NUL-terminated.
+ */
+static size_t
+pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
+                               pg_locale_t locale)
+{
+       char            stackbuf[TEXTBUFLEN];
+       char       *work = stackbuf;
+       UChar      *ustr;
+       int32_t         nuchars;
+       size_t          ustr_bytes;
+       Size            keylen;
+
+       Assert(locale->provider == COLLPROVIDER_ICU);
+
+       init_icu_converter();
+
+       /* measure first, so the conversion buffer can be sized */
+       nuchars = uchar_length(icu_converter, src, srclen);
+       ustr_bytes = (nuchars + 1) * sizeof(UChar);
+
+       if (ustr_bytes > TEXTBUFLEN)
+               work = palloc(ustr_bytes);
+
+       ustr = (UChar *) work;
+       nuchars = uchar_convert(icu_converter, ustr, nuchars + 1, src, srclen);
+
+       keylen = ucol_getSortKey(locale->info.icu.ucol,
+                                                        ustr, nuchars,
+                                                        (uint8_t *) dest, destsize);
+
+       /*
+        * ucol_getSortKey() counts the nul-terminator in the result length, but
+        * this function should not.
+        */
+       Assert(keylen > 0);
+       keylen--;
+
+       if (work != stackbuf)
+               pfree(work);
+
+       /* if dest is defined, it should be nul-terminated */
+       Assert(keylen >= destsize || dest[keylen] == '\0');
+
+       return keylen;
+}
+
+/*
+ * Produce a sort-key prefix for 'src' when the database encoding is not
+ * UTF-8: convert 'src' to UChars, then let ucol_nextSortKeyPart() write up
+ * to 'destsize' bytes into 'dest'. Returns the value ucol_nextSortKeyPart()
+ * reports.
+ *
+ * 'srclen' of -1 means the strings are NUL-terminated.
+ */
+static size_t
+pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
+                                                          int32_t destsize, pg_locale_t locale)
+{
+       char                     sbuf[TEXTBUFLEN];
+       char                    *buf   = sbuf;
+       UCharIterator    iter;
+       uint32_t                 state[2];
+       UErrorCode               status;
+       int32_t                  ulen;
+       UChar                   *uchar;
+       size_t                   uchar_bsize;
+       Size                     result_bsize;
+
+       Assert(locale->provider == COLLPROVIDER_ICU);
+       Assert(GetDatabaseEncoding() != PG_UTF8);
+
+       init_icu_converter();
+
+       /* measure first, so the conversion buffer can be sized */
+       ulen = uchar_length(icu_converter, src, srclen);
+
+       uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+       if (uchar_bsize > TEXTBUFLEN)
+               buf = palloc(uchar_bsize);
+
+       uchar = (UChar *) buf;
+
+       ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+       uiter_setString(&iter, uchar, ulen);
+       state[0] = state[1] = 0;        /* won't need that again */
+       status = U_ZERO_ERROR;
+       result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
+                                                                               &iter,
+                                                                               state,
+                                                                               (uint8_t *) dest,
+                                                                               destsize,
+                                                                               &status);
+       if (U_FAILURE(status))
+               ereport(ERROR,
+                               (errmsg("sort key generation failed: %s",
+                                               u_errorName(status))));
+
+       /* release the conversion buffer if it had to be heap-allocated */
+       if (buf != sbuf)
+               pfree(buf);
+
+       return result_bsize;
+}
+
+/*
+ * Produce a sort-key prefix with ICU. UTF-8 input is iterated over directly;
+ * any other encoding is delegated to pg_strnxfrm_prefix_icu_no_utf8().
+ *
+ * 'srclen' of -1 means the strings are NUL-terminated.
+ */
+static size_t
+pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
+                                          int32_t destsize, pg_locale_t locale)
+{
+       UCharIterator iter;
+       uint32_t        state[2];
+       UErrorCode      status;
+       size_t          nbytes;
+
+       Assert(locale->provider == COLLPROVIDER_ICU);
+
+       if (GetDatabaseEncoding() != PG_UTF8)
+               return pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
+                                                                                         locale);
+
+       uiter_setUTF8(&iter, src, srclen);
+       state[0] = state[1] = 0;        /* won't need that again */
+       status = U_ZERO_ERROR;
+       nbytes = ucol_nextSortKeyPart(locale->info.icu.ucol,
+                                                                 &iter,
+                                                                 state,
+                                                                 (uint8_t *) dest,
+                                                                 destsize,
+                                                                 &status);
+       if (U_FAILURE(status))
+               ereport(ERROR,
+                               (errmsg("sort key generation failed: %s",
+                                               u_errorName(status))));
+
+       return nbytes;
+}
+
+#endif
+
+/*
+ * Return true if the collation provider supports pg_strxfrm() and
+ * pg_strnxfrm(); otherwise false.
+ *
+ * Unfortunately, it seems that strxfrm() for non-C collations is broken on
+ * many common platforms; testing of multiple versions of glibc reveals that,
+ * for many locales, strcoll() and strxfrm() do not return consistent
+ * results. While no libc besides glibc and Cygwin has so far been shown to
+ * have a problem, we take the conservative course of action for right now and
+ * disable this categorically.  (Users who are certain this isn't a problem on
+ * their system can define TRUST_STRXFRM.)
+ *
+ * No similar problem is known for the ICU provider.
+ */
+bool
+pg_strxfrm_enabled(pg_locale_t locale)
+{
+       /* libc (including a NULL locale): only if strxfrm() is trusted */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+       {
+#ifdef TRUST_STRXFRM
+               return true;
+#else
+               return false;
+#endif
+       }
+
+       if (locale->provider == COLLPROVIDER_ICU)
+               return true;
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return false;                           /* keep compiler quiet */
+}
+
+/*
+ * pg_strxfrm
+ *
+ * Transforms 'src' to a nul-terminated string stored in 'dest' such that
+ * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
+ * untransformed strings.
+ *
+ * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
+ * may be NULL.
+ *
+ * Returns the number of bytes needed to store the transformed string,
+ * excluding the terminating nul byte. If the value returned is 'destsize' or
+ * greater, the resulting contents of 'dest' are undefined.
+ */
+size_t
+pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
+{
+       /* a NULL locale is handled by the libc code path */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               return pg_strxfrm_libc(dest, src, destsize, locale);
+
+#ifdef USE_ICU
+       /* length -1 tells the ICU routine that 'src' is NUL-terminated */
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strnxfrm_icu(dest, src, -1, destsize, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+/*
+ * pg_strnxfrm
+ *
+ * Transforms 'src' to a nul-terminated string stored in 'dest' such that
+ * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
+ * untransformed strings.
+ *
+ * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
+ * be NULL.
+ *
+ * Returns the number of bytes needed to store the transformed string,
+ * excluding the terminating nul byte. If the value returned is 'destsize' or
+ * greater, the resulting contents of 'dest' are undefined.
+ *
+ * This function may need to nul-terminate the argument for libc functions;
+ * so if the caller already has a nul-terminated string, it should call
+ * pg_strxfrm() instead.
+ */
+size_t
+pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
+                       pg_locale_t locale)
+{
+       /* a NULL locale is handled by the libc code path */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               return pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
+
+#ifdef USE_ICU
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+/*
+ * Return true if the collation provider supports pg_strxfrm_prefix() and
+ * pg_strnxfrm_prefix(); otherwise false.
+ */
+bool
+pg_strxfrm_prefix_enabled(pg_locale_t locale)
+{
+       /* the libc path (including a NULL locale) has no prefix support */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               return false;
+
+       if (locale->provider == COLLPROVIDER_ICU)
+               return true;
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return false;                           /* keep compiler quiet */
+}
+
+/*
+ * pg_strxfrm_prefix
+ *
+ * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
+ * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * untransformed strings. The result is not nul-terminated.
+ *
+ * The provided 'src' must be nul-terminated.
+ *
+ * If destsize is not large enough to hold the resulting byte sequence, stores
+ * only the first destsize bytes in 'dest'. Returns the number of bytes
+ * actually copied to 'dest'.
+ */
+size_t
+pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
+                                 pg_locale_t locale)
+{
+       /*
+        * The libc provider has no prefix support.  'locale' may be NULL on this
+        * path, so name the provider with a constant instead of dereferencing it.
+        */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()",
+                        COLLPROVIDER_LIBC);
+
+#ifdef USE_ICU
+       /* length -1 tells the ICU routine that 'src' is NUL-terminated */
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+/*
+ * pg_strnxfrm_prefix
+ *
+ * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
+ * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * untransformed strings. The result is not nul-terminated.
+ *
+ * 'src' does not need to be nul-terminated; 'srclen' gives its length in
+ * bytes.
+ *
+ * If destsize is not large enough to hold the resulting byte sequence, stores
+ * only the first destsize bytes in 'dest'. Returns the number of bytes
+ * actually copied to 'dest'.
+ *
+ * If the caller already has a nul-terminated string, it can call
+ * pg_strxfrm_prefix() instead.
+ */
+size_t
+pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
+                                  size_t srclen, pg_locale_t locale)
+{
+       /*
+        * The libc provider has no prefix support.  'locale' may be NULL on this
+        * path, so name the provider with a constant instead of dereferencing it.
+        */
+       if (!locale || locale->provider == COLLPROVIDER_LIBC)
+               elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()",
+                        COLLPROVIDER_LIBC);
+
+#ifdef USE_ICU
+       /* pass the real length: 'src' is not required to be nul-terminated */
+       if (locale->provider == COLLPROVIDER_ICU)
+               return pg_strnxfrm_prefix_icu(dest, src, srclen, destsize, locale);
+#endif
+
+       /* shouldn't happen */
+       elog(ERROR, "unsupported collprovider: %c", locale->provider);
+       return 0;                                       /* keep compiler quiet */
+}
+
+#ifdef USE_ICU
  static void
  init_icu_converter(void)
  {
@@ -1767,6 +2476,39 @@ init_icu_converter(void)
         icu_converter = conv;
  }
  
+/*
+ * Find length, in UChars, of given string if converted to UChar string.
+ */
+static size_t
+uchar_length(UConverter *converter, const char *str, int32_t len)
+{
+       UErrorCode      err = U_ZERO_ERROR;
+       int32_t         needed;
+
+       /* NULL destination with zero capacity: only the length is wanted */
+       needed = ucnv_toUChars(converter, NULL, 0, str, len, &err);
+
+       /* a buffer-overflow status is tolerated here; anything else is fatal */
+       if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+               ereport(ERROR,
+                               (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(err))));
+
+       return needed;
+}
+
+/*
+ * Convert the given source string into a UChar string, stored in dest, and
+ * return the length (in UChars).
+ */
+static int32_t
+uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
+                         const char *src, int32_t srclen)
+{
+       UErrorCode      err = U_ZERO_ERROR;
+       int32_t         nuchars;
+
+       nuchars = ucnv_toUChars(converter, dest, destlen, src, srclen, &err);
+
+       /* unlike uchar_length(), every failure status is an error here */
+       if (U_FAILURE(err))
+               ereport(ERROR,
+                               (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(err))));
+
+       return nuchars;
+}
+
  /*
   * Convert a string in the database encoding into a string of UChars.
   *
@@ -1782,26 +2524,15 @@ init_icu_converter(void)
  int32_t
  icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
  {
-       UErrorCode      status;
-       int32_t         len_uchar;
+       int32_t len_uchar;      /* UChar count: measured first, then the converted length */
  
         init_icu_converter();
  
-       status = U_ZERO_ERROR;
-       len_uchar = ucnv_toUChars(icu_converter, NULL, 0,
-                                                         buff, nbytes, &status);
-       if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
-               ereport(ERROR,
-                               (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+       len_uchar = uchar_length(icu_converter, buff, nbytes);  /* sizing pass only */
  
         *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
-
-       status = U_ZERO_ERROR;
-       len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,
-                                                         buff, nbytes, &status);
-       if (U_FAILURE(status))
-               ereport(ERROR,
-                               (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+       len_uchar = uchar_convert(icu_converter,
+                                                         *buff_uchar, len_uchar + 1, buff, nbytes);    /* conversion into the new buffer */
  
         return len_uchar;
  }
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c

index 8ddbae8f51dfb780bc462070dc4ff027f00fa902..9ff3bcbdb75c7c9c02e6d08e9f446179c224f912 100644 (file)
--- a/src/backend/utils/adt/varchar.c
+++ b/src/backend/utils/adt/varchar.c
@@ -1024,21 +1024,22 @@ hashbpchar(PG_FUNCTION_ARGS)
  #ifdef USE_ICU
                 if (mylocale->provider == COLLPROVIDER_ICU)
                 {
-                       int32_t         ulen = -1;
-                       UChar      *uchar = NULL;
-                       Size            bsize;
-                       uint8_t    *buf;
+                       Size            bsize, rsize;
+                       char       *buf;
  
-                       ulen = icu_to_uchar(&uchar, keydata, keylen);
+                       bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
+                       buf = palloc(bsize + 1);
  
-                       bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-                                                                       uchar, ulen, NULL, 0);
-                       buf = palloc(bsize);
-                       ucol_getSortKey(mylocale->info.icu.ucol,
-                                                       uchar, ulen, buf, bsize);
-                       pfree(uchar);
+                       rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
+                       if (rsize != bsize)
+                               elog(ERROR, "pg_strnxfrm() returned unexpected result");
  
-                       result = hash_any(buf, bsize);
+                       /*
+                        * In principle, there's no reason to include the terminating NUL
+                        * character in the hash, but it was done before and the behavior
+                        * must be preserved.
+                        */
+                       result = hash_any((uint8_t *) buf, bsize + 1);
  
                         pfree(buf);
                 }
@@ -1086,21 +1087,23 @@ hashbpcharextended(PG_FUNCTION_ARGS)
  #ifdef USE_ICU
                 if (mylocale->provider == COLLPROVIDER_ICU)
                 {
-                       int32_t         ulen = -1;
-                       UChar      *uchar = NULL;
-                       Size            bsize;
-                       uint8_t    *buf;
+                       Size            bsize, rsize;
+                       char       *buf;
  
-                       ulen = icu_to_uchar(&uchar, keydata, keylen);
+                       bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
+                       buf = palloc(bsize + 1);
  
-                       bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-                                                                       uchar, ulen, NULL, 0);
-                       buf = palloc(bsize);
-                       ucol_getSortKey(mylocale->info.icu.ucol,
-                                                       uchar, ulen, buf, bsize);
-                       pfree(uchar);
+                       rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
+                       if (rsize != bsize)
+                               elog(ERROR, "pg_strnxfrm() returned unexpected result");
  
-                       result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+                       /*
+                        * In principle, there's no reason to include the terminating NUL
+                        * character in the hash, but it was done before and the behavior
+                        * must be preserved.
+                        */
+                       result = hash_any_extended((uint8_t *) buf, bsize + 1,
+                                                                          PG_GETARG_INT64(1));
  
                         pfree(buf);
                 }
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 170b3a3820b7524d5228f08c8c8713bedf8a0577..4ca823ca7b136735e06124c95ce9e7a5b987c5d1 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
         }
         else
         {
-               char            a1buf[TEXTBUFLEN];
-               char            a2buf[TEXTBUFLEN];
-               char       *a1p,
-                                  *a2p;
                 pg_locale_t mylocale;
  
                 mylocale = pg_newlocale_from_collation(collid);
@@ -1573,171 +1569,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
                 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
                         return 0;
  
-#ifdef WIN32
-               /* Win32 does not have UTF-8, so we need to map to UTF-16 */
-               if (GetDatabaseEncoding() == PG_UTF8
-                       && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
-               {
-                       int                     a1len;
-                       int                     a2len;
-                       int                     r;
-
-                       if (len1 >= TEXTBUFLEN / 2)
-                       {
-                               a1len = len1 * 2 + 2;
-                               a1p = palloc(a1len);
-                       }
-                       else
-                       {
-                               a1len = TEXTBUFLEN;
-                               a1p = a1buf;
-                       }
-                       if (len2 >= TEXTBUFLEN / 2)
-                       {
-                               a2len = len2 * 2 + 2;
-                               a2p = palloc(a2len);
-                       }
-                       else
-                       {
-                               a2len = TEXTBUFLEN;
-                               a2p = a2buf;
-                       }
-
-                       /* stupid Microsloth API does not work for zero-length input */
-                       if (len1 == 0)
-                               r = 0;
-                       else
-                       {
-                               r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
-                                                                               (LPWSTR) a1p, a1len / 2);
-                               if (!r)
-                                       ereport(ERROR,
-                                                       (errmsg("could not convert string to UTF-16: error code %lu",
-                                                                       GetLastError())));
-                       }
-                       ((LPWSTR) a1p)[r] = 0;
-
-                       if (len2 == 0)
-                               r = 0;
-                       else
-                       {
-                               r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
-                                                                               (LPWSTR) a2p, a2len / 2);
-                               if (!r)
-                                       ereport(ERROR,
-                                                       (errmsg("could not convert string to UTF-16: error code %lu",
-                                                                       GetLastError())));
-                       }
-                       ((LPWSTR) a2p)[r] = 0;
-
-                       errno = 0;
-#ifdef HAVE_LOCALE_T
-                       if (mylocale)
-                               result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
-                       else
-#endif
-                               result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
-                       if (result == 2147483647)       /* _NLSCMPERROR; missing from mingw
-                                                                                * headers */
-                               ereport(ERROR,
-                                               (errmsg("could not compare Unicode strings: %m")));
-
-                       /* Break tie if necessary. */
-                       if (result == 0 &&
-                               (!mylocale || mylocale->deterministic))
-                       {
-                               result = memcmp(arg1, arg2, Min(len1, len2));
-                               if ((result == 0) && (len1 != len2))
-                                       result = (len1 < len2) ? -1 : 1;
-                       }
-
-                       if (a1p != a1buf)
-                               pfree(a1p);
-                       if (a2p != a2buf)
-                               pfree(a2p);
-
-                       return result;
-               }
-#endif                                                 /* WIN32 */
-
-               if (len1 >= TEXTBUFLEN)
-                       a1p = (char *) palloc(len1 + 1);
-               else
-                       a1p = a1buf;
-               if (len2 >= TEXTBUFLEN)
-                       a2p = (char *) palloc(len2 + 1);
-               else
-                       a2p = a2buf;
-
-               memcpy(a1p, arg1, len1);
-               a1p[len1] = '\0';
-               memcpy(a2p, arg2, len2);
-               a2p[len2] = '\0';
-
-               if (mylocale)
-               {
-                       if (mylocale->provider == COLLPROVIDER_ICU)
-                       {
-#ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
-                               if (GetDatabaseEncoding() == PG_UTF8)
-                               {
-                                       UErrorCode      status;
-
-                                       status = U_ZERO_ERROR;
-                                       result = ucol_strcollUTF8(mylocale->info.icu.ucol,
-                                                                                         arg1, len1,
-                                                                                         arg2, len2,
-                                                                                         &status);
-                                       if (U_FAILURE(status))
-                                               ereport(ERROR,
-                                                               (errmsg("collation failed: %s", u_errorName(status))));
-                               }
-                               else
-#endif
-                               {
-                                       int32_t         ulen1,
-                                                               ulen2;
-                                       UChar      *uchar1,
-                                                          *uchar2;
-
-                                       ulen1 = icu_to_uchar(&uchar1, arg1, len1);
-                                       ulen2 = icu_to_uchar(&uchar2, arg2, len2);
-
-                                       result = ucol_strcoll(mylocale->info.icu.ucol,
-                                                                                 uchar1, ulen1,
-                                                                                 uchar2, ulen2);
-
-                                       pfree(uchar1);
-                                       pfree(uchar2);
-                               }
-#else                                                  /* not USE_ICU */
-                               /* shouldn't happen */
-                               elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
-#endif                                                 /* not USE_ICU */
-                       }
-                       else
-                       {
-#ifdef HAVE_LOCALE_T
-                               result = strcoll_l(a1p, a2p, mylocale->info.lt);
-#else
-                               /* shouldn't happen */
-                               elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
-#endif
-                       }
-               }
-               else
-                       result = strcoll(a1p, a2p);
+               result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
  
                 /* Break tie if necessary. */
                 if (result == 0 &&
                         (!mylocale || mylocale->deterministic))
-                       result = strcmp(a1p, a2p);
-
-               if (a1p != a1buf)
-                       pfree(a1p);
-               if (a2p != a2buf)
-                       pfree(a2p);
+               {
+                       result = memcmp(arg1, arg2, Min(len1, len2));
+                       if ((result == 0) && (len1 != len2))
+                               result = (len1 < len2) ? -1 : 1;
+               }
         }
  
         return result;
@@ -2073,20 +1914,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
                  */
                 locale = pg_newlocale_from_collation(collid);
  
-               /*
-                * There is a further exception on Windows.  When the database
-                * encoding is UTF-8 and we are not using the C collation, complex
-                * hacks are required.  We don't currently have a comparator that
-                * handles that case, so we fall back on the slow method of having the
-                * sort code invoke bttextcmp() (in the case of text) via the fmgr
-                * trampoline.  ICU locales work just the same on Windows, however.
-                */
-#ifdef WIN32
-               if (GetDatabaseEncoding() == PG_UTF8 &&
-                       !(locale && locale->provider == COLLPROVIDER_ICU))
-                       return;
-#endif
-
                 /*
                  * We use varlenafastcmp_locale except for type NAME.
                  */
@@ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
  
         /*
          * Unfortunately, it seems that abbreviation for non-C collations is
-        * broken on many common platforms; testing of multiple versions of glibc
-        * reveals that, for many locales, strcoll() and strxfrm() do not return
-        * consistent results, which is fatal to this optimization.  While no
-        * other libc other than Cygwin has so far been shown to have a problem,
-        * we take the conservative course of action for right now and disable
-        * this categorically.  (Users who are certain this isn't a problem on
-        * their system can define TRUST_STRXFRM.)
+        * broken on many common platforms; see pg_strxfrm_enabled().
          *
          * Even apart from the risk of broken locales, it's possible that there
          * are platforms where the use of abbreviated keys should be disabled at
@@ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
          * categorically, we may still want or need to disable it for particular
          * platforms.
          */
-#ifndef TRUST_STRXFRM
-       if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
+       if (!collate_c && !pg_strxfrm_enabled(locale))
                 abbreviate = false;
-#endif
  
         /*
          * If we're using abbreviated keys, or if we're using a locale-aware
@@ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
                 return sss->last_returned;
         }
  
-       if (sss->locale)
-       {
-               if (sss->locale->provider == COLLPROVIDER_ICU)
-               {
-#ifdef USE_ICU
-#ifdef HAVE_UCOL_STRCOLLUTF8
-                       if (GetDatabaseEncoding() == PG_UTF8)
-                       {
-                               UErrorCode      status;
-
-                               status = U_ZERO_ERROR;
-                               result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
-                                                                                 a1p, len1,
-                                                                                 a2p, len2,
-                                                                                 &status);
-                               if (U_FAILURE(status))
-                                       ereport(ERROR,
-                                                       (errmsg("collation failed: %s", u_errorName(status))));
-                       }
-                       else
-#endif
-                       {
-                               int32_t         ulen1,
-                                                       ulen2;
-                               UChar      *uchar1,
-                                                  *uchar2;
-
-                               ulen1 = icu_to_uchar(&uchar1, a1p, len1);
-                               ulen2 = icu_to_uchar(&uchar2, a2p, len2);
-
-                               result = ucol_strcoll(sss->locale->info.icu.ucol,
-                                                                         uchar1, ulen1,
-                                                                         uchar2, ulen2);
-
-                               pfree(uchar1);
-                               pfree(uchar2);
-                       }
-#else                                                  /* not USE_ICU */
-                       /* shouldn't happen */
-                       elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
-#endif                                                 /* not USE_ICU */
-               }
-               else
-               {
-#ifdef HAVE_LOCALE_T
-                       result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
-#else
-                       /* shouldn't happen */
-                       elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
-#endif
-               }
-       }
-       else
-               result = strcoll(sss->buf1, sss->buf2);
+       result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
  
         /* Break tie if necessary. */
         if (result == 0 &&
@@ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
  static Datum
  varstr_abbrev_convert(Datum original, SortSupport ssup)
  {
+       const size_t max_prefix_bytes = sizeof(Datum);
         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
         VarString  *authoritative = DatumGetVarStringPP(original);
         char       *authoritative_data = VARDATA_ANY(authoritative);
@@ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
  
         pres = (char *) &res;
         /* memset(), so any non-overwritten bytes are NUL */
-       memset(pres, 0, sizeof(Datum));
+       memset(pres, 0, max_prefix_bytes);
         len = VARSIZE_ANY_EXHDR(authoritative);
  
         /* Get number of bytes, ignoring trailing spaces */
@@ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
          * thing: explicitly consider string length.
          */
         if (sss->collate_c)
-               memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
+               memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
         else
         {
                 Size            bsize;
-#ifdef USE_ICU
-               int32_t         ulen = -1;
-               UChar      *uchar = NULL;
-#endif
  
                 /*
                  * We're not using the C collation, so fall back on strxfrm or ICU
@@ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
                 if (sss->last_len1 == len && sss->cache_blob &&
                         memcmp(sss->buf1, authoritative_data, len) == 0)
                 {
-                       memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
+                       memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
                         /* No change affecting cardinality, so no hashing required */
                         goto done;
                 }
@@ -2551,81 +2314,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
                 memcpy(sss->buf1, authoritative_data, len);
  
                 /*
-                * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
-                * necessary for ICU, but doesn't hurt.
+                * pg_strxfrm() and pg_strxfrm_prefix() expect NUL-terminated
+                * strings.
                  */
                 sss->buf1[len] = '\0';
                 sss->last_len1 = len;
  
-#ifdef USE_ICU
-               /* When using ICU and not UTF8, convert string to UChar. */
-               if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
-                       GetDatabaseEncoding() != PG_UTF8)
-                       ulen = icu_to_uchar(&uchar, sss->buf1, len);
-#endif
-
-               /*
-                * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
-                * and try again.  Both of these functions have the result buffer
-                * content undefined if the result did not fit, so we need to retry
-                * until everything fits, even though we only need the first few bytes
-                * in the end.  When using ucol_nextSortKeyPart(), however, we only
-                * ask for as many bytes as we actually need.
-                */
-               for (;;)
+               if (pg_strxfrm_prefix_enabled(sss->locale))
                 {
-#ifdef USE_ICU
-                       if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
+                       if (sss->buflen2 < max_prefix_bytes)
                         {
-                               /*
-                                * When using UTF8, use the iteration interface so we only
-                                * need to produce as many bytes as we actually need.
-                                */
-                               if (GetDatabaseEncoding() == PG_UTF8)
-                               {
-                                       UCharIterator iter;
-                                       uint32_t        state[2];
-                                       UErrorCode      status;
-
-                                       uiter_setUTF8(&iter, sss->buf1, len);
-                                       state[0] = state[1] = 0;        /* won't need that again */
-                                       status = U_ZERO_ERROR;
-                                       bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
-                                                                                                &iter,
-                                                                                                state,
-                                                                                                (uint8_t *) sss->buf2,
-                                                                                                Min(sizeof(Datum), sss->buflen2),
-                                                                                                &status);
-                                       if (U_FAILURE(status))
-                                               ereport(ERROR,
-                                                               (errmsg("sort key generation failed: %s",
-                                                                               u_errorName(status))));
-                               }
-                               else
-                                       bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
-                                                                                       uchar, ulen,
-                                                                                       (uint8_t *) sss->buf2, sss->buflen2);
+                               sss->buflen2 = Max(max_prefix_bytes,
+                                                                  Min(sss->buflen2 * 2, MaxAllocSize));
+                               sss->buf2 = repalloc(sss->buf2, sss->buflen2);
                         }
-                       else
-#endif
-#ifdef HAVE_LOCALE_T
-                       if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
-                               bsize = strxfrm_l(sss->buf2, sss->buf1,
-                                                                 sss->buflen2, sss->locale->info.lt);
-                       else
-#endif
-                               bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
-
-                       sss->last_len2 = bsize;
-                       if (bsize < sss->buflen2)
-                               break;
  
+                       bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
+                                                                         max_prefix_bytes, sss->locale);
+
+                       /*
+                        * Remember the blob length, just as the pg_strxfrm() loop in
+                        * the other branch does.  The cache-hit path earlier in this
+                        * function copies Min(max_prefix_bytes, sss->last_len2) bytes
+                        * out of buf2, so it must see this blob's length and not a
+                        * stale value left by an earlier call.
+                        */
+                       sss->last_len2 = bsize;
+               }
+               else
+               {
                         /*
-                        * Grow buffer and retry.
+                        * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
+                        * again.  The pg_strxfrm() function leaves the result buffer
+                        * content undefined if the result did not fit, so we need to
+                        * retry until everything fits, even though we only need the first
+                        * few bytes in the end.
                          */
-                       sss->buflen2 = Max(bsize + 1,
-                                                          Min(sss->buflen2 * 2, MaxAllocSize));
-                       sss->buf2 = repalloc(sss->buf2, sss->buflen2);
+                       for (;;)
+                       {
+                               bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
+                                                                  sss->locale);
+
+                               sss->last_len2 = bsize;
+                               if (bsize < sss->buflen2)
+                                       break;
+
+                               /*
+                                * Grow buffer and retry.
+                                */
+                               sss->buflen2 = Max(bsize + 1,
+                                                                  Min(sss->buflen2 * 2, MaxAllocSize));
+                               sss->buf2 = repalloc(sss->buf2, sss->buflen2);
+                       }
                 }
  
                 /*
@@ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
                  * (Actually, even if there were NUL bytes in the blob it would be
                  * okay.  See remarks on bytea case above.)
                  */
-               memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
-
-#ifdef USE_ICU
-               if (uchar)
-                       pfree(uchar);
-#endif
+               memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
         }
  
         /*
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h

index cede43440b575abc1f9e0faa1ba5fe68b8737db1..def2b55f94177f88bd2dec1e387d6c66be9d9506 100644 (file)
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -100,6 +100,19 @@ extern void make_icu_collator(const char *iculocstr,
  extern pg_locale_t pg_newlocale_from_collation(Oid collid);
  
  extern char *get_collation_actual_version(char collprovider, const char *collcollate);
+extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
+extern int pg_strncoll(const char *arg1, size_t len1,
+                                          const char *arg2, size_t len2, pg_locale_t locale);
+extern bool pg_strxfrm_enabled(pg_locale_t locale);
+extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
+                                                pg_locale_t locale);
+extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
+                                                 size_t srclen, pg_locale_t locale);
+extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
+extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
+                                                               pg_locale_t locale);
+extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
+                                                                size_t srclen, pg_locale_t locale);
  
  #ifdef USE_ICU
  extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
author	Jeff Davis <jdavis@postgresql.org>
	Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)
committer	Jeff Davis <jdavis@postgresql.org>
	Thu, 23 Feb 2023 18:55:20 +0000 (10:55 -0800)
src/backend/access/hash/hashfunc.c		patch \| blob \| blame \| history
src/backend/utils/adt/pg_locale.c		patch \| blob \| blame \| history
src/backend/utils/adt/varchar.c		patch \| blob \| blame \| history
src/backend/utils/adt/varlena.c		patch \| blob \| blame \| history
src/include/utils/pg_locale.h		patch \| blob \| blame \| history