LCOV - PostgreSQL 19devel - src/backend/utils/adt/varlena.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - varlena.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	1710	1891	90.4 %
Date:	2025-07-09 01:17:29	Functions:	132	143	92.3 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * varlena.c
       4             :  *    Functions for the variable-length built-in types.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/varlena.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include <ctype.h>
      18             : #include <limits.h>
      19             : 
      20             : #include "access/detoast.h"
      21             : #include "access/toast_compression.h"
      22             : #include "catalog/pg_collation.h"
      23             : #include "catalog/pg_type.h"
      24             : #include "common/hashfn.h"
      25             : #include "common/int.h"
      26             : #include "common/unicode_category.h"
      27             : #include "common/unicode_norm.h"
      28             : #include "common/unicode_version.h"
      29             : #include "funcapi.h"
      30             : #include "lib/hyperloglog.h"
      31             : #include "libpq/pqformat.h"
      32             : #include "miscadmin.h"
      33             : #include "nodes/execnodes.h"
      34             : #include "parser/scansup.h"
      35             : #include "port/pg_bswap.h"
      36             : #include "regex/regex.h"
      37             : #include "utils/builtins.h"
      38             : #include "utils/guc.h"
      39             : #include "utils/lsyscache.h"
      40             : #include "utils/memutils.h"
      41             : #include "utils/pg_locale.h"
      42             : #include "utils/sortsupport.h"
      43             : #include "utils/varlena.h"
      44             : 
      45             : typedef struct varlena VarString;   /* alias used by the DatumGetVarStringP[P] macros below */
      46             : 
      47             : /*
      48             :  * Search state passed to every text_position_* function (setup, next, get_match_*, cleanup).
      49             :  */
      50             : typedef struct
      51             : {
      52             :     pg_locale_t locale;         /* collation used for substring matching */
      53             :     bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
      54             :     bool        greedy;         /* find longest possible substring? */
      55             : 
      56             :     char       *str1;           /* haystack string */
      57             :     char       *str2;           /* needle string */
      58             :     int         len1;           /* length of str1 in bytes */
      59             :     int         len2;           /* length of str2 in bytes */
      60             : 
      61             :     /* Skip table for Boyer-Moore-Horspool search algorithm: */
      62             :     int         skiptablemask;  /* mask for ANDing with skiptable subscripts */
      63             :     int         skiptable[256]; /* skip distance for given mismatched char */
      64             : 
      65             :     /*
      66             :      * Note that with nondeterministic collations, the length of the last
      67             :      * match is not necessarily equal to the length of the "needle" passed in.
      68             :      */
      69             :     char       *last_match;     /* pointer to last match in 'str1' */
      70             :     int         last_match_len; /* length of last match */
      71             :     int         last_match_len_tmp; /* same but for internal use */
      72             : 
      73             :     /*
      74             :      * Sometimes we need to convert the byte position of a match to a
      75             :      * character position.  These two fields remember the last position that
      76             :      * was converted, so that the next call can continue from that point
      77             :      * rather than count characters from the very beginning.
      78             :      */
      79             :     char       *refpoint;       /* pointer within original haystack string */
      80             :     int         refpos;         /* 0-based character offset of the same point */
      81             : } TextPositionState;
      82             : 
      83             : typedef struct                  /* NOTE(review): presumably the private state of the *fastcmp_* and varstr_abbrev_* routines declared below -- confirm */
      84             : {
      85             :     char       *buf1;           /* 1st string, or abbreviation original string
      86             :                                  * buf */
      87             :     char       *buf2;           /* 2nd string, or abbreviation strxfrm() buf */
      88             :     int         buflen1;        /* Allocated length of buf1 */
      89             :     int         buflen2;        /* Allocated length of buf2 */
      90             :     int         last_len1;      /* Length of last buf1 string/strxfrm() input */
      91             :     int         last_len2;      /* Length of last buf2 string/strxfrm() blob */
      92             :     int         last_returned;  /* Last comparison result (cache) */
      93             :     bool        cache_blob;     /* Does buf2 contain strxfrm() blob, etc? */
      94             :     bool        collate_c;      /* NOTE(review): presumably "collation is C"; set outside this view */
      95             :     Oid         typid;          /* Actual datatype (text/bpchar/bytea/name) */
      96             :     hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
      97             :     hyperLogLogState full_card; /* Full key cardinality state */
      98             :     double      prop_card;      /* Required cardinality proportion */
      99             :     pg_locale_t locale;         /* NOTE(review): presumably the locale used for comparisons -- confirm */
     100             : } VarStringSortSupport;
     101             : 
     102             : /*
     103             :  * Output data for split_text() and split_text_accum_result(): we output either
     104             :  * to an array or a table.  tupstore and tupdesc must be set up in advance to output to a table.
     105             :  */
     106             : typedef struct
     107             : {
     108             :     ArrayBuildState *astate;    /* accumulator used for array output */
     109             :     Tuplestorestate *tupstore;  /* destination used for table output */
     110             :     TupleDesc   tupdesc;        /* tuple descriptor used for table output */
     111             : } SplitTextOutputData;
     112             : 
     113             : /*
     114             :  * Size for on-stack string buffers: this should be large enough that most
     115             :  * strings will fit, but small enough that we feel comfortable putting it on the stack.
     116             :  */
     117             : #define TEXTBUFLEN      1024
     118             : /* Fetch a Datum as a VarString pointer through PG_DETOAST_DATUM / PG_DETOAST_DATUM_PACKED. */
     119             : #define DatumGetVarStringP(X)       ((VarString *) PG_DETOAST_DATUM(X))
     120             : #define DatumGetVarStringPP(X)      ((VarString *) PG_DETOAST_DATUM_PACKED(X))
     121             : 
     122             : static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
     123             : static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
     124             : static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
     125             : static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
     126             : static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
     127             : static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
     128             : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
     129             : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
     130             : static int32 text_length(Datum str);
     131             : static text *text_catenate(text *t1, text *t2);
     132             : static text *text_substring(Datum str,
     133             :                             int32 start,
     134             :                             int32 length,
     135             :                             bool length_not_specified);
     136             : static text *text_overlay(text *t1, text *t2, int sp, int sl);
     137             : static int  text_position(text *t1, text *t2, Oid collid);
     138             : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
     139             : static bool text_position_next(TextPositionState *state);
     140             : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
     141             : static char *text_position_get_match_ptr(TextPositionState *state);
     142             : static int  text_position_get_match_pos(TextPositionState *state);
     143             : static void text_position_cleanup(TextPositionState *state);
     144             : static void check_collation_set(Oid collid);
     145             : static int  text_cmp(text *arg1, text *arg2, Oid collid);
     146             : static void appendStringInfoText(StringInfo str, const text *t);
     147             : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
     148             : static void split_text_accum_result(SplitTextOutputData *tstate,
     149             :                                     text *field_value,
     150             :                                     text *null_string,
     151             :                                     Oid collation);
     152             : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
     153             :                                     const char *fldsep, const char *null_string);
     154             : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
     155             : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
     156             :                                      int *value);
     157             : static const char *text_format_parse_format(const char *start_ptr,
     158             :                                             const char *end_ptr,
     159             :                                             int *argpos, int *widthpos,
     160             :                                             int *flags, int *width);
     161             : static void text_format_string_conversion(StringInfo buf, char conversion,
     162             :                                           FmgrInfo *typOutputInfo,
     163             :                                           Datum value, bool isNull,
     164             :                                           int flags, int width);
     165             : static void text_format_append_string(StringInfo buf, const char *str,
     166             :                                       int flags, int width);
     167             : 
     168             : 
     169             : /*****************************************************************************
     170             :  *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                          *
     171             :  *****************************************************************************/
     172             : 
     173             : /*
     174             :  * cstring_to_text
     175             :  *
     176             :  * Create a text value from the null-terminated C string 's'.  The length is
     177             :  * taken with strlen(s), so the terminator itself is not part of the value.
     178             :  * The new text value is freshly palloc'd with a full-size VARHDR.
     179             :  */
     180             : text *
     181    25598500 : cstring_to_text(const char *s)
     182             : {
     183    25598500 :     return cstring_to_text_with_len(s, strlen(s));
     184             : }
     185             : 
     186             : /*
     187             :  * cstring_to_text_with_len
     188             :  *
     189             :  * Same as cstring_to_text except the caller specifies the string length
     190             :  * 'len'; the string need not be null-terminated.
     191             :  */
     192             : text *
     193    28402826 : cstring_to_text_with_len(const char *s, int len)
     194             : {
     195    28402826 :     text       *result = (text *) palloc(len + VARHDRSZ);
     196             :     /* total size is header plus payload; exactly 'len' bytes of 's' are copied */
     197    28402826 :     SET_VARSIZE(result, len + VARHDRSZ);
     198    28402826 :     memcpy(VARDATA(result), s, len);
     199             : 
     200    28402826 :     return result;
     201             : }
     202             : 
     203             : /*
     204             :  * text_to_cstring
     205             :  *
     206             :  * Create a palloc'd, null-terminated C string from a text value.
     207             :  *
     208             :  * We support being passed a compressed or toasted text value.
     209             :  * This is a bit bogus since such values shouldn't really be referred to as
     210             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     211             :  * case here, we'd need another routine that did, anyway.
     212             :  */
     213             : char *
     214    18573102 : text_to_cstring(const text *t)
     215             : {
     216             :     /* must cast away the const, unfortunately */
     217    18573102 :     text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
     218    18573102 :     int         len = VARSIZE_ANY_EXHDR(tunpacked);
     219             :     char       *result;
     220             :     /* one extra byte for the terminating NUL */
     221    18573102 :     result = (char *) palloc(len + 1);
     222    18573102 :     memcpy(result, VARDATA_ANY(tunpacked), len);
     223    18573102 :     result[len] = '\0';
     224             :     /* free the separate copy if pg_detoast_datum_packed() returned one */
     225    18573102 :     if (tunpacked != t)
     226       45890 :         pfree(tunpacked);
     227             : 
     228    18573102 :     return result;
     229             : }
     230             : 
     231             : /*
     232             :  * text_to_cstring_buffer
     233             :  *
     234             :  * Copy a text value into a caller-supplied buffer of size dst_len
     235             :  * (the size includes the byte needed for the terminator).
     236             :  * The text string is truncated if necessary to fit.  The result is
     237             :  * guaranteed null-terminated (unless dst_len == 0).
     238             :  *
     239             :  * We support being passed a compressed or toasted text value.
     240             :  * This is a bit bogus since such values shouldn't really be referred to as
     241             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     242             :  * case here, we'd need another routine that did, anyway.
     243             :  */
     244             : void
     245         994 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
     246             : {
     247             :     /* must cast away the const, unfortunately */
     248         994 :     text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
     249         994 :     size_t      src_len = VARSIZE_ANY_EXHDR(srcunpacked);
     250             :     /* with dst_len == 0 nothing at all is written to dst */
     251         994 :     if (dst_len > 0)
     252             :     {
     253         994 :         dst_len--;              /* reserve room for the terminating NUL */
     254         994 :         if (dst_len >= src_len)
     255         994 :             dst_len = src_len;
     256             :         else                    /* ensure truncation is encoding-safe */
     257           0 :             dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
     258         994 :         memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
     259         994 :         dst[dst_len] = '\0';
     260             :     }
     261             :     /* release the separate copy if pg_detoast_datum_packed() returned one */
     262         994 :     if (srcunpacked != src)
     263           0 :         pfree(srcunpacked);
     264         994 : }
     265             : 
     266             : 
     267             : /*****************************************************************************
     268             :  *   USER I/O ROUTINES                                                       *
     269             :  *****************************************************************************/
     270             : 
     271             : /*
     272             :  *      textin          - converts the cstring in argument 0 to a text value via cstring_to_text()
     273             :  */
     274             : Datum
     275    22308166 : textin(PG_FUNCTION_ARGS)
     276             : {
     277    22308166 :     char       *inputText = PG_GETARG_CSTRING(0);
     278             : 
     279    22308166 :     PG_RETURN_TEXT_P(cstring_to_text(inputText));
     280             : }
     281             : 
     282             : /*
     283             :  *      textout         - converts the text Datum in argument 0 to a cstring via TextDatumGetCString()
     284             :  */
     285             : Datum
     286     9438060 : textout(PG_FUNCTION_ARGS)
     287             : {
     288     9438060 :     Datum       txt = PG_GETARG_DATUM(0);
     289             : 
     290     9438060 :     PG_RETURN_CSTRING(TextDatumGetCString(txt));
     291             : }
     292             : 
     293             : /*
     294             :  *      textrecv            - converts external binary format (a StringInfo in argument 0) to text
     295             :  */
     296             : Datum
     297          48 : textrecv(PG_FUNCTION_ARGS)
     298             : {
     299          48 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     300             :     text       *result;
     301             :     char       *str;
     302             :     int         nbytes;
     303             :     /* take everything from the buffer's cursor to its end */
     304          48 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     305             :     /* build the text from the nbytes reported back, then drop the temporary string */
     306          48 :     result = cstring_to_text_with_len(str, nbytes);
     307          48 :     pfree(str);
     308          48 :     PG_RETURN_TEXT_P(result);
     309             : }
     310             : 
     311             : /*
     312             :  *      textsend            - converts the text in argument 0 to binary format (returned as bytea)
     313             :  */
     314             : Datum
     315        4914 : textsend(PG_FUNCTION_ARGS)
     316             : {
     317        4914 :     text       *t = PG_GETARG_TEXT_PP(0);
     318             :     StringInfoData buf;
     319             :     /* send only the data bytes of the value (VARDATA_ANY / VARSIZE_ANY_EXHDR) */
     320        4914 :     pq_begintypsend(&buf);
     321        4914 :     pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
     322        4914 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     323             : }
     324             : 
     325             : 
     326             : /*
     327             :  *      unknownin           - converts cstring to internal representation (a pstrdup'd copy of the input)
     328             :  */
     329             : Datum
     330           0 : unknownin(PG_FUNCTION_ARGS)
     331             : {
     332           0 :     char       *str = PG_GETARG_CSTRING(0);
     333             : 
     334             :     /* representation is same as cstring */
     335           0 :     PG_RETURN_CSTRING(pstrdup(str));
     336             : }
     337             : 
     338             : /*
     339             :  *      unknownout          - converts internal representation to cstring (a pstrdup'd copy of the input)
     340             :  */
     341             : Datum
     342         952 : unknownout(PG_FUNCTION_ARGS)
     343             : {
     344             :     /* representation is same as cstring */
     345         952 :     char       *str = PG_GETARG_CSTRING(0);
     346             : 
     347         952 :     PG_RETURN_CSTRING(pstrdup(str));
     348             : }
     349             : 
     350             : /*
     351             :  *      unknownrecv         - converts external binary format (a StringInfo in argument 0) to unknown
     352             :  */
     353             : Datum
     354           0 : unknownrecv(PG_FUNCTION_ARGS)
     355             : {
     356           0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     357             :     char       *str;
     358             :     int         nbytes;
     359             :     /* take everything from the buffer's cursor to its end; nbytes is not used afterwards */
     360           0 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     361             :     /* representation is same as cstring */
     362           0 :     PG_RETURN_CSTRING(str);
     363             : }
     364             : 
     365             : /*
     366             :  *      unknownsend         - converts unknown to binary format (returned as bytea)
     367             :  */
     368             : Datum
     369           0 : unknownsend(PG_FUNCTION_ARGS)
     370             : {
     371             :     /* representation is same as cstring */
     372           0 :     char       *str = PG_GETARG_CSTRING(0);
     373             :     StringInfoData buf;
     374             :     /* strlen(str) bytes are sent, i.e. the terminating NUL is not included */
     375           0 :     pq_begintypsend(&buf);
     376           0 :     pq_sendtext(&buf, str, strlen(str));
     377           0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     378             : }
     379             : 
     380             : 
     381             : /* ========== PUBLIC ROUTINES ========== */
     382             : 
     383             : /*
     384             :  * textlen -
     385             :  *    returns the logical length of a text*, as computed by text_length()
     386             :  *     (which is less than the VARSIZE of the text*)
     387             :  */
     388             : Datum
     389      430792 : textlen(PG_FUNCTION_ARGS)
     390             : {
     391      430792 :     Datum       str = PG_GETARG_DATUM(0);
     392             : 
     393             :     /* pass the raw Datum along: try to avoid decompressing argument */
     394      430792 :     PG_RETURN_INT32(text_length(str));
     395             : }
     396             : 
     397             : /*
     398             :  * text_length -
     399             :  *  Does the real work for textlen(): returns the logical length of 'str'.
     400             :  *
     401             :  *  This is broken out so it can be called directly by other string processing
     402             :  *  functions.  The argument is passed as a Datum, to indicate that it may still
     403             :  *  be in compressed form; we can avoid decompressing it at all in some cases.
     404             :  *  This is a plain C function returning int32, so it uses ordinary "return"
     405             :  *  statements rather than the fmgr PG_RETURN_* macros. */
     406             : static int32
     407      430804 : text_length(Datum str)
     408             : {
     409             :     /* fastpath when max encoding length is one */
     410      430804 :     if (pg_database_encoding_max_length() == 1)
     411          20 :         return toast_raw_datum_size(str) - VARHDRSZ;
     412             :     else
     413             :     {
     414      430784 :         text       *t = DatumGetTextPP(str);
     415             :         /* multibyte encoding: the data itself has to be examined */
     416      430784 :         return pg_mbstrlen_with_len(VARDATA_ANY(t),
     417             :                                     VARSIZE_ANY_EXHDR(t));
     418             :     }
     419             : }
     420             : 
     421             : /*
     422             :  * textoctetlen -
     423             :  *    returns the physical length of a text*: its raw datum size minus VARHDRSZ
     424             :  *     (which is less than the VARSIZE of the text*)
     425             :  */
     426             : Datum
     427          70 : textoctetlen(PG_FUNCTION_ARGS)
     428             : {
     429          70 :     Datum       str = PG_GETARG_DATUM(0);
     430             : 
     431             :     /* We need not detoast the input at all */
     432          70 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     433             : }
     434             : 
     435             : /*
     436             :  * textcat -
     437             :  *    takes two text* (arguments 0 and 1) and returns a text* that is the
     438             :  *    concatenation of the two, built by text_catenate().
     439             :  *
     440             :  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
     441             :  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
     442             :  * Allocate space for output in all cases.
     443             :  * XXX - thomas 1997-07-10
     444             :  */
     445             : Datum
     446     1953372 : textcat(PG_FUNCTION_ARGS)
     447             : {
     448     1953372 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     449     1953372 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     450             : 
     451     1953372 :     PG_RETURN_TEXT_P(text_catenate(t1, t2));
     452             : }
     453             : 
     454             : /*
     455             :  * text_catenate
     456             :  *  Guts of textcat(), broken out so it can be used by other functions.
     457             :  *  Returns a newly palloc'd text holding t1's data followed by t2's data.
     458             :  * Arguments can be in short-header form, but not compressed or out-of-line
     459             :  */
     460             : static text *
     461     1953452 : text_catenate(text *t1, text *t2)
     462             : {
     463             :     text       *result;
     464             :     int         len1,           /* data length of t1 */
     465             :                 len2,           /* data length of t2 */
     466             :                 len;            /* total size of the result, header included */
     467             :     char       *ptr;
     468             : 
     469     1953452 :     len1 = VARSIZE_ANY_EXHDR(t1);
     470     1953452 :     len2 = VARSIZE_ANY_EXHDR(t2);
     471             : 
     472             :     /* paranoia ... probably should throw error instead? */
     473     1953452 :     if (len1 < 0)
     474           0 :         len1 = 0;
     475     1953452 :     if (len2 < 0)
     476           0 :         len2 = 0;
     477             :     /* both payloads plus one full-size header */
     478     1953452 :     len = len1 + len2 + VARHDRSZ;
     479     1953452 :     result = (text *) palloc(len);
     480             : 
     481             :     /* Set size of result string... */
     482     1953452 :     SET_VARSIZE(result, len);
     483             : 
     484             :     /* Fill data field of result string... (empty inputs are simply not copied) */
     485     1953452 :     ptr = VARDATA(result);
     486     1953452 :     if (len1 > 0)
     487     1952628 :         memcpy(ptr, VARDATA_ANY(t1), len1);
     488     1953452 :     if (len2 > 0)
     489     1953242 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
     490             : 
     491     1953452 :     return result;
     492             : }
     493             : 
     494             : /*
     495             :  * charlen_to_bytelen()
     496             :  *  Compute the number of bytes occupied by n characters starting at *p.
     497             :  *  When the encoding's max length is 1 the answer is simply n.
     498             :  * It is caller's responsibility that there actually are n characters;
     499             :  * the string need not be null-terminated.
     500             :  */
     501             : static int
     502       17142 : charlen_to_bytelen(const char *p, int n)
     503             : {
     504       17142 :     if (pg_database_encoding_max_length() == 1)
     505             :     {
     506             :         /* Optimization for single-byte encodings */
     507         180 :         return n;
     508             :     }
     509             :     else
     510             :     {
     511             :         const char *s;
     512             :         /* step over n characters, advancing by pg_mblen() each time */
     513     6051642 :         for (s = p; n > 0; n--)
     514     6034680 :             s += pg_mblen(s);
     515             :         /* byte distance covered */
     516       16962 :         return s - p;
     517             :     }
     518             : }
     519             : 
     520             : /*
     521             :  * text_substr()
     522             :  * Return a substring starting at the specified position.
     523             :  * - thomas 1997-12-31
     524             :  *
     525             :  * Input:
     526             :  *  - string
     527             :  *  - starting position (is one-based)
     528             :  *  - substring length
     529             :  *
     530             :  * If the starting position is zero or less, then return from the start of the string
     531             :  *  adjusting the length to be consistent with the "negative start" per SQL.
     532             :  * A negative length is not accepted: text_substring() reports "negative substring length not allowed".
     533             :  *
     534             :  * Added multibyte support.
     535             :  * - Tatsuo Ishii 1998-4-21
     536             :  * Changed behavior if starting position is less than one to conform to SQL behavior.
     537             :  * Formerly returned the entire string; now returns a portion.
     538             :  * - Thomas Lockhart 1998-12-10
     539             :  * Now uses faster TOAST-slicing interface
     540             :  * - John Gray 2002-02-22
     541             :  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
     542             :  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
     543             :  * error; if E < 1, return '', not entire string). Fixed MB related bug when
     544             :  * S > LC and < LC + 4 sometimes garbage characters are returned.
     545             :  * - Joe Conway 2002-08-10
     546             :  */
     547             : Datum
     548      658050 : text_substr(PG_FUNCTION_ARGS)
     549             : {
     550      658050 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     551             :                                     PG_GETARG_INT32(1),
     552             :                                     PG_GETARG_INT32(2),
     553             :                                     false));
     554             : }
     555             : 
     556             : /*
     557             :  * text_substr_no_len -
     558             :  *    Two-argument wrapper (string, start) to avoid opr_sanity failure due to
     559             :  *    one function accepting a different number of args; passes length_not_specified = true.
     560             :  */
     561             : Datum
     562          36 : text_substr_no_len(PG_FUNCTION_ARGS)
     563             : {
     564          36 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     565             :                                     PG_GETARG_INT32(1),
     566             :                                     -1, true));
     567             : }
     568             : 
     569             : /*
     570             :  * text_substring -
     571             :  *  Does the real work for text_substr() and text_substr_no_len()
     572             :  *
     573             :  *  This is broken out so it can be called directly by other string processing
     574             :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     575             :  *  it may still be in compressed/toasted form.  We can avoid detoasting all
     576             :  *  of it in some cases.
     577             :  *
     578             :  *  The result is always a freshly palloc'd datum.
     579             :  */
     580             : static text *
     581      698198 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
     582             : {
     583      698198 :     int32       eml = pg_database_encoding_max_length();
     584      698198 :     int32       S = start;      /* start position */
     585             :     int32       S1;             /* adjusted start position */
     586             :     int32       L1;             /* adjusted substring length */
     587             :     int32       E;              /* end position */
     588             : 
     589             :     /*
     590             :      * SQL99 says S can be zero or negative (which we don't document), but we
     591             :      * still must fetch from the start of the string.
     592             :      * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
     593             :      */
     594      698198 :     S1 = Max(S, 1);
     595             : 
     596             :     /* life is easy if the encoding max length is 1 */
     597      698198 :     if (eml == 1)
     598             :     {
     599          22 :         if (length_not_specified)   /* special case - get length to end of
     600             :                                      * string */
     601           0 :             L1 = -1;
     602          22 :         else if (length < 0)
     603             :         {
     604             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     605           0 :             ereport(ERROR,
     606             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     607             :                      errmsg("negative substring length not allowed")));
     608             :             L1 = -1;            /* silence stupider compilers */
     609             :         }
     610          22 :         else if (pg_add_s32_overflow(S, length, &E))
     611             :         {
     612             :             /*
     613             :              * L could be large enough for S + L to overflow, in which case
     614             :              * the substring must run to end of string.
     615             :              */
     616           0 :             L1 = -1;
     617             :         }
     618             :         else
     619             :         {
     620             :             /*
     621             :              * A zero or negative value for the end position can happen if the
     622             :              * start was negative or one. SQL99 says to return a zero-length
     623             :              * string.
     624             :              */
     625          22 :             if (E < 1)
     626           0 :                 return cstring_to_text("");
     627             : 
     628          22 :             L1 = E - S1;
     629             :         }
     630             : 
     631             :         /*
     632             :          * If the start position is past the end of the string, SQL99 says to
     633             :          * return a zero-length string -- DatumGetTextPSlice() will do that
     634             :          * for us.  We need only convert S1 to zero-based starting position.
     635             :          */
     636          22 :         return DatumGetTextPSlice(str, S1 - 1, L1);
     637             :     }
     638      698176 :     else if (eml > 1)
     639             :     {
     640             :         /*
     641             :          * When encoding max length is > 1, we can't get LC without
     642             :          * detoasting, so we'll grab a conservatively large slice now and go
     643             :          * back later to do the right thing
     644             :          */
     645             :         int32       slice_start;
     646             :         int32       slice_size;
     647             :         int32       slice_strlen;
     648             :         text       *slice;
     649             :         int32       E1;
     650             :         int32       i;
     651             :         char       *p;
     652             :         char       *s;
     653             :         text       *ret;
     654             : 
     655             :         /*
     656             :          * We need to start at position zero because there is no way to know
     657             :          * in advance which byte offset corresponds to the supplied start
     658             :          * position.
     659             :          */
     660      698176 :         slice_start = 0;
     661             : 
     662      698176 :         if (length_not_specified)   /* special case - get length to end of
     663             :                                      * string */
     664          76 :             slice_size = L1 = -1;
     665      698100 :         else if (length < 0)
     666             :         {
     667             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     668          12 :             ereport(ERROR,
     669             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     670             :                      errmsg("negative substring length not allowed")));
     671             :             slice_size = L1 = -1;   /* silence stupider compilers */
     672             :         }
     673      698088 :         else if (pg_add_s32_overflow(S, length, &E))
     674             :         {
     675             :             /*
     676             :              * L could be large enough for S + L to overflow, in which case
     677             :              * the substring must run to end of string.
     678             :              */
     679           6 :             slice_size = L1 = -1;
     680             :         }
     681             :         else
     682             :         {
     683             :             /*
     684             :              * A zero or negative value for the end position can happen if the
     685             :              * start was negative or one. SQL99 says to return a zero-length
     686             :              * string.
     687             :              */
     688      698082 :             if (E < 1)
     689           0 :                 return cstring_to_text("");
     690             : 
     691             :             /*
     692             :              * if E is past the end of the string, the tuple toaster will
     693             :              * truncate the length for us
     694             :              */
     695      698082 :             L1 = E - S1;
     696             : 
     697             :             /*
     698             :              * Total slice size in bytes can't be any longer than the start
     699             :              * position plus substring length times the encoding max length.
     700             :              * If that overflows, we can just use -1.
     701             :              */
     702      698082 :             if (pg_mul_s32_overflow(E, eml, &slice_size))
     703           6 :                 slice_size = -1;
     704             :         }
     705             : 
     706             :         /*
     707             :          * If we're working with an untoasted source, no need to do an extra
     708             :          * copying step.
     709             :          */
     710      698164 :         if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
     711      698110 :             VARATT_IS_EXTERNAL(DatumGetPointer(str)))
     712         324 :             slice = DatumGetTextPSlice(str, slice_start, slice_size);
     713             :         else
     714      697840 :             slice = (text *) DatumGetPointer(str);
     715             : 
     716             :         /* see if we got back an empty string */
     717      698164 :         if (VARSIZE_ANY_EXHDR(slice) == 0)
     718             :         {
     719           0 :             if (slice != (text *) DatumGetPointer(str))
     720           0 :                 pfree(slice);
     721           0 :             return cstring_to_text("");
     722             :         }
     723             : 
     724             :         /* Now we can get the actual length of the slice in MB characters */
     725      698164 :         slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
     726      698164 :                                             VARSIZE_ANY_EXHDR(slice));
     727             : 
     728             :         /*
     729             :          * Check that the start position wasn't > slice_strlen. If so, SQL99
     730             :          * says to return a zero-length string.
     731             :          */
     732      698164 :         if (S1 > slice_strlen)
     733             :         {
     734          22 :             if (slice != (text *) DatumGetPointer(str))
     735           0 :                 pfree(slice);
     736          22 :             return cstring_to_text("");
     737             :         }
     738             : 
     739             :         /*
     740             :          * Adjust L1 and E1 now that we know the slice string length. Again
     741             :          * remember that S1 is one based, and slice_start is zero based.
     742             :          */
     743      698142 :         if (L1 > -1)
     744      698082 :             E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
     745             :         else
     746          60 :             E1 = slice_start + 1 + slice_strlen;
     747             : 
     748             :         /*
     749             :          * Find the start position in the slice; remember S1 is not zero based
     750             :          */
     751      698142 :         p = VARDATA_ANY(slice);
     752     6713912 :         for (i = 0; i < S1 - 1; i++)
     753     6015770 :             p += pg_mblen(p);
     754             : 
     755             :         /* hang onto a pointer to our start position */
     756      698142 :         s = p;
     757             : 
     758             :         /*
     759             :          * Count the actual bytes used by the substring of the requested
     760             :          * length.
     761             :          */
     762     9936426 :         for (i = S1; i < E1; i++)
     763     9238284 :             p += pg_mblen(p);
     764             : 
     765      698142 :         ret = (text *) palloc(VARHDRSZ + (p - s));
     766      698142 :         SET_VARSIZE(ret, VARHDRSZ + (p - s));
     767      698142 :         memcpy(VARDATA(ret), s, (p - s));
     768             : 
     769      698142 :         if (slice != (text *) DatumGetPointer(str))
     770         324 :             pfree(slice);
     771             : 
     772      698142 :         return ret;
     773             :     }
     774             :     else
     775           0 :         elog(ERROR, "invalid backend encoding: encoding max length < 1");
     776             : 
     777             :     /* not reached: suppress compiler warning */
     778             :     return NULL;
     779             : }
     780             : 
     781             : /*
     782             :  * textoverlay
     783             :  *  Replace specified substring of first string with second
     784             :  *
     785             :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
     786             :  * This code is a direct implementation of what the standard says.
     787             :  */
     788             : Datum
     789          28 : textoverlay(PG_FUNCTION_ARGS)
     790             : {
     791          28 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     792          28 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     793          28 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     794          28 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
     795             : 
     796          28 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     797             : }
     798             : 
     799             : Datum
     800          12 : textoverlay_no_len(PG_FUNCTION_ARGS)
     801             : {
     802          12 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     803          12 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     804          12 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     805             :     int         sl;
     806             : 
     807          12 :     sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
     808          12 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     809             : }
     810             : 
     811             : static text *
     812          40 : text_overlay(text *t1, text *t2, int sp, int sl)
     813             : {
     814             :     text       *result;
     815             :     text       *s1;
     816             :     text       *s2;
     817             :     int         sp_pl_sl;
     818             : 
     819             :     /*
     820             :      * Check for possible integer-overflow cases.  For negative sp, throw a
     821             :      * "substring length" error because that's what should be expected
     822             :      * according to the spec's definition of OVERLAY().
     823             :      */
     824          40 :     if (sp <= 0)
     825           0 :         ereport(ERROR,
     826             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
     827             :                  errmsg("negative substring length not allowed")));
     828          40 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
     829           0 :         ereport(ERROR,
     830             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
     831             :                  errmsg("integer out of range")));
     832             : 
     833          40 :     s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
     834          40 :     s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
     835          40 :     result = text_catenate(s1, t2);
     836          40 :     result = text_catenate(result, s2);
     837             : 
     838          40 :     return result;
     839             : }
     840             : 
     841             : /*
     842             :  * textpos -
     843             :  *    Return the position of the specified substring.
     844             :  *    Implements the SQL POSITION() function.
     845             :  *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
     846             :  * - thomas 1997-07-27
     847             :  */
     848             : Datum
     849         130 : textpos(PG_FUNCTION_ARGS)
     850             : {
     851         130 :     text       *str = PG_GETARG_TEXT_PP(0);
     852         130 :     text       *search_str = PG_GETARG_TEXT_PP(1);
     853             : 
     854         130 :     PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
     855             : }
     856             : 
     857             : /*
     858             :  * text_position -
     859             :  *  Does the real work for textpos()
     860             :  *
     861             :  * Inputs:
     862             :  *      t1 - string to be searched
     863             :  *      t2 - pattern to match within t1
     864             :  * Result:
     865             :  *      Character index of the first matched char, starting from 1,
     866             :  *      or 0 if no match.
     867             :  *
     868             :  *  This is broken out so it can be called directly by other string processing
     869             :  *  functions.
     870             :  */
     871             : static int
     872         130 : text_position(text *t1, text *t2, Oid collid)
     873             : {
     874             :     TextPositionState state;
     875             :     int         result;
     876             : 
     877         130 :     check_collation_set(collid);
     878             : 
     879             :     /* Empty needle always matches at position 1 */
     880         130 :     if (VARSIZE_ANY_EXHDR(t2) < 1)
     881          12 :         return 1;
     882             : 
     883             :     /* Otherwise, can't match if haystack is shorter than needle */
     884         118 :     if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
     885          22 :         pg_newlocale_from_collation(collid)->deterministic)
     886          22 :         return 0;
     887             : 
     888          96 :     text_position_setup(t1, t2, collid, &state);
     889             :     /* don't need greedy mode here */
     890          96 :     state.greedy = false;
     891             : 
     892          96 :     if (!text_position_next(&state))
     893          24 :         result = 0;
     894             :     else
     895          72 :         result = text_position_get_match_pos(&state);
     896          96 :     text_position_cleanup(&state);
     897          96 :     return result;
     898             : }
     899             : 
     900             : 
     901             : /*
     902             :  * text_position_setup, text_position_next, text_position_cleanup -
     903             :  *  Component steps of text_position()
     904             :  *
     905             :  * These are broken out so that a string can be efficiently searched for
     906             :  * multiple occurrences of the same pattern.  text_position_next may be
     907             :  * called multiple times, and it advances to the next match on each call.
     908             :  * text_position_get_match_ptr() and text_position_get_match_pos() return
     909             :  * a pointer or 1-based character position of the last match, respectively.
     910             :  *
     911             :  * The "state" variable is normally just a local variable in the caller.
     912             :  *
     913             :  * NOTE: text_position_next skips over the matched portion.  For example,
     914             :  * searching for "xx" in "xxx" returns only one match, not two.
     915             :  */
     916             : 
     917             : static void
     918        1772 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
     919             : {
     920        1772 :     int         len1 = VARSIZE_ANY_EXHDR(t1);
     921        1772 :     int         len2 = VARSIZE_ANY_EXHDR(t2);
     922             : 
     923        1772 :     check_collation_set(collid);
     924             : 
     925        1772 :     state->locale = pg_newlocale_from_collation(collid);
     926             : 
     927             :     /*
     928             :      * Most callers need greedy mode, but some might want to unset this to
     929             :      * optimize.
     930             :      */
     931        1772 :     state->greedy = true;
     932             : 
     933             :     Assert(len2 > 0);
     934             : 
     935             :     /*
     936             :      * Even with a multi-byte encoding, we perform the search using the raw
     937             :      * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
     938             :      * because in UTF-8 the byte sequence of one character cannot contain
     939             :      * another character.  For other multi-byte encodings, we do the search
     940             :      * initially as a simple byte search, ignoring multibyte issues, but
     941             :      * verify afterwards that the match we found is at a character boundary,
     942             :      * and continue the search if it was a false match.
     943             :      */
     944        1772 :     if (pg_database_encoding_max_length() == 1)
     945         108 :         state->is_multibyte_char_in_char = false;
     946        1664 :     else if (GetDatabaseEncoding() == PG_UTF8)
     947        1664 :         state->is_multibyte_char_in_char = false;
     948             :     else
     949           0 :         state->is_multibyte_char_in_char = true;
     950             : 
     951        1772 :     state->str1 = VARDATA_ANY(t1);
     952        1772 :     state->str2 = VARDATA_ANY(t2);
     953        1772 :     state->len1 = len1;
     954        1772 :     state->len2 = len2;
     955        1772 :     state->last_match = NULL;
     956        1772 :     state->refpoint = state->str1;
     957        1772 :     state->refpos = 0;
     958             : 
     959             :     /*
     960             :      * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
     961             :      * notes we use the terminology that the "haystack" is the string to be
     962             :      * searched (t1) and the "needle" is the pattern being sought (t2).
     963             :      *
     964             :      * If the needle is empty or bigger than the haystack then there is no
     965             :      * point in wasting cycles initializing the table.  We also choose not to
     966             :      * use B-M-H for needles of length 1, since the skip table can't possibly
     967             :      * save anything in that case.
     968             :      *
     969             :      * (With nondeterministic collations, the search is already
     970             :      * multibyte-aware, so we don't need this.)
     971             :      */
     972        1772 :     if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
     973             :     {
     974        1444 :         int         searchlength = len1 - len2;
     975             :         int         skiptablemask;
     976             :         int         last;
     977             :         int         i;
     978        1444 :         const char *str2 = state->str2;
     979             : 
     980             :         /*
     981             :          * First we must determine how much of the skip table to use.  The
     982             :          * declaration of TextPositionState allows up to 256 elements, but for
     983             :          * short search problems we don't really want to have to initialize so
     984             :          * many elements --- it would take too long in comparison to the
     985             :          * actual search time.  So we choose a useful skip table size based on
     986             :          * the haystack length minus the needle length.  The closer the needle
     987             :          * length is to the haystack length the less useful skipping becomes.
     988             :          *
     989             :          * Note: since we use bit-masking to select table elements, the skip
     990             :          * table size MUST be a power of 2, and so the mask must be 2^N-1.
     991             :          */
     992        1444 :         if (searchlength < 16)
     993         114 :             skiptablemask = 3;
     994        1330 :         else if (searchlength < 64)
     995          16 :             skiptablemask = 7;
     996        1314 :         else if (searchlength < 128)
     997          14 :             skiptablemask = 15;
     998        1300 :         else if (searchlength < 512)
     999         254 :             skiptablemask = 31;
    1000        1046 :         else if (searchlength < 2048)
    1001         794 :             skiptablemask = 63;
    1002         252 :         else if (searchlength < 4096)
    1003         176 :             skiptablemask = 127;
    1004             :         else
    1005          76 :             skiptablemask = 255;
    1006        1444 :         state->skiptablemask = skiptablemask;
    1007             : 
    1008             :         /*
    1009             :          * Initialize the skip table.  We set all elements to the needle
    1010             :          * length, since this is the correct skip distance for any character
    1011             :          * not found in the needle.
    1012             :          */
    1013      103180 :         for (i = 0; i <= skiptablemask; i++)
    1014      101736 :             state->skiptable[i] = len2;
    1015             : 
    1016             :         /*
    1017             :          * Now examine the needle.  For each character except the last one,
    1018             :          * set the corresponding table element to the appropriate skip
    1019             :          * distance.  Note that when two characters share the same skip table
    1020             :          * entry, the one later in the needle must determine the skip
    1021             :          * distance.
    1022             :          */
    1023        1444 :         last = len2 - 1;
    1024             : 
    1025       19164 :         for (i = 0; i < last; i++)
    1026       17720 :             state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
    1027             :     }
    1028        1772 : }
    1029             : 
    1030             : /*
    1031             :  * Advance to the next match, starting from the end of the previous match
    1032             :  * (or the beginning of the string, on first call).  Returns true if a match
    1033             :  * is found.
    1034             :  *
    1035             :  * Note that this refuses to match an empty-string needle.  Most callers
    1036             :  * will have handled that case specially and we'll never see it here.
    1037             :  */
    1038             : static bool
    1039        8066 : text_position_next(TextPositionState *state)
    1040             : {
    1041        8066 :     int         needle_len = state->len2;
    1042             :     char       *start_ptr;
    1043             :     char       *matchptr;
    1044             : 
    1045        8066 :     if (needle_len <= 0)
    1046           0 :         return false;           /* result for empty pattern */
    1047             : 
    1048             :     /* Start from the point right after the previous match. */
    1049        8066 :     if (state->last_match)
    1050        6282 :         start_ptr = state->last_match + state->last_match_len;
    1051             :     else
    1052        1784 :         start_ptr = state->str1;
    1053             : 
    1054        8066 : retry:
    1055        8066 :     matchptr = text_position_next_internal(start_ptr, state);
    1056             : 
    1057        8066 :     if (!matchptr)
    1058        1688 :         return false;
    1059             : 
    1060             :     /*
    1061             :      * Found a match for the byte sequence.  If this is a multibyte encoding,
    1062             :      * where one character's byte sequence can appear inside a longer
    1063             :      * multi-byte character, we need to verify that the match was at a
    1064             :      * character boundary, not in the middle of a multi-byte character.
    1065             :      */
    1066        6378 :     if (state->is_multibyte_char_in_char && state->locale->deterministic)
    1067             :     {
    1068             :         /* Walk one character at a time, until we reach the match. */
    1069             : 
    1070             :         /* the search should never move backwards. */
    1071             :         Assert(state->refpoint <= matchptr);
    1072             : 
    1073           0 :         while (state->refpoint < matchptr)
    1074             :         {
    1075             :             /* step to next character. */
    1076           0 :             state->refpoint += pg_mblen(state->refpoint);
    1077           0 :             state->refpos++;
    1078             : 
    1079             :             /*
    1080             :              * If we stepped over the match's start position, then it was a
    1081             :              * false positive, where the byte sequence appeared in the middle
    1082             :              * of a multi-byte character.  Skip it, and continue the search at
    1083             :              * the next character boundary.
    1084             :              */
    1085           0 :             if (state->refpoint > matchptr)
    1086             :             {
    1087           0 :                 start_ptr = state->refpoint;
    1088           0 :                 goto retry;
    1089             :             }
    1090             :         }
    1091             :     }
    1092             : 
    1093        6378 :     state->last_match = matchptr;
    1094        6378 :     state->last_match_len = state->last_match_len_tmp;
    1095        6378 :     return true;
    1096             : }
    1097             : 
    1098             : /*
    1099             :  * Subroutine of text_position_next().  This searches for the raw byte
    1100             :  * sequence, ignoring any multi-byte encoding issues.  Returns the first
    1101             :  * match starting at 'start_ptr', or NULL if no match is found.
    1102             :  */
    1103             : static char *
    1104        8066 : text_position_next_internal(char *start_ptr, TextPositionState *state)
    1105             : {
    1106        8066 :     int         haystack_len = state->len1;
    1107        8066 :     int         needle_len = state->len2;
    1108        8066 :     int         skiptablemask = state->skiptablemask;
    1109        8066 :     const char *haystack = state->str1;
    1110        8066 :     const char *needle = state->str2;
    1111        8066 :     const char *haystack_end = &haystack[haystack_len];
    1112             :     const char *hptr;
    1113             : 
    1114             :     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
    1115             : 
    1116        8066 :     state->last_match_len_tmp = needle_len;
    1117             : 
    1118        8066 :     if (!state->locale->deterministic)
    1119             :     {
    1120             :         /*
    1121             :          * With a nondeterministic collation, we have to use an unoptimized
    1122             :          * route.  We walk through the haystack and see if at each position
    1123             :          * there is a substring of the remaining string that is equal to the
    1124             :          * needle under the given collation.
    1125             :          *
    1126             :          * Note, the found substring could have a different length than the
    1127             :          * needle, including being empty.  Callers that want to skip over the
    1128             :          * found string need to read the length of the found substring from
    1129             :          * last_match_len rather than just using the length of their needle.
    1130             :          *
    1131             :          * Most callers will require "greedy" semantics, meaning that we need
    1132             :          * to find the longest such substring, not the shortest.  For callers
    1133             :          * that don't need greedy semantics, we can finish on the first match.
    1134             :          */
    1135         240 :         const char *result_hptr = NULL;
    1136             : 
    1137         240 :         hptr = start_ptr;
    1138         642 :         while (hptr < haystack_end)
    1139             :         {
    1140             :             /*
    1141             :              * First check the common case that there is a match in the
    1142             :              * haystack of exactly the length of the needle.
    1143             :              */
    1144         534 :             if (!state->greedy &&
    1145         108 :                 haystack_end - hptr >= needle_len &&
    1146          54 :                 pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
    1147          12 :                 return (char *) hptr;
    1148             : 
    1149             :             /*
    1150             :              * Else check if any of the possible substrings starting at hptr
    1151             :              * are equal to the needle.
    1152             :              */
    1153        2586 :             for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
    1154             :             {
    1155        2064 :                 if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
    1156             :                 {
    1157         132 :                     state->last_match_len_tmp = (test_end - hptr);
    1158         132 :                     result_hptr = hptr;
    1159         132 :                     if (!state->greedy)
    1160           0 :                         break;
    1161             :                 }
    1162             :             }
    1163         522 :             if (result_hptr)
    1164         120 :                 break;
    1165             : 
    1166         402 :             hptr += pg_mblen(hptr);
    1167             :         }
    1168             : 
    1169         228 :         return (char *) result_hptr;
    1170             :     }
    1171        7826 :     else if (needle_len == 1)
    1172             :     {
    1173             :         /* No point in using B-M-H for a one-character needle */
    1174         760 :         char        nchar = *needle;
    1175             : 
    1176         760 :         hptr = start_ptr;
    1177        5878 :         while (hptr < haystack_end)
    1178             :         {
    1179        5712 :             if (*hptr == nchar)
    1180         594 :                 return (char *) hptr;
    1181        5118 :             hptr++;
    1182             :         }
    1183             :     }
    1184             :     else
    1185             :     {
    1186        7066 :         const char *needle_last = &needle[needle_len - 1];
    1187             : 
    1188             :         /* Start at startpos plus the length of the needle */
    1189        7066 :         hptr = start_ptr + needle_len - 1;
    1190      180742 :         while (hptr < haystack_end)
    1191             :         {
    1192             :             /* Match the needle scanning *backward* */
    1193             :             const char *nptr;
    1194             :             const char *p;
    1195             : 
    1196      179328 :             nptr = needle_last;
    1197      179328 :             p = hptr;
    1198      263578 :             while (*nptr == *p)
    1199             :             {
    1200             :                 /* Matched it all?  If so, return 1-based position */
    1201       89902 :                 if (nptr == needle)
    1202        5652 :                     return (char *) p;
    1203       84250 :                 nptr--, p--;
    1204             :             }
    1205             : 
    1206             :             /*
    1207             :              * No match, so use the haystack char at hptr to decide how far to
    1208             :              * advance.  If the needle had any occurrence of that character
    1209             :              * (or more precisely, one sharing the same skiptable entry)
    1210             :              * before its last character, then we advance far enough to align
    1211             :              * the last such needle character with that haystack position.
    1212             :              * Otherwise we can advance by the whole needle length.
    1213             :              */
    1214      173676 :             hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
    1215             :         }
    1216             :     }
    1217             : 
    1218        1580 :     return 0;                   /* not found */
    1219             : }
    1220             : 
    1221             : /*
    1222             :  * Return a pointer to the current match (state->last_match).
    1223             :  *
    1224             :  * The returned pointer points into the original haystack string.  It is NULL right after text_position_reset(), which clears last_match.
    1225             :  */
    1226             : static char *
    1227        6276 : text_position_get_match_ptr(TextPositionState *state)
    1228             : {
    1229        6276 :     return state->last_match;   /* plain accessor; state is not modified */
    1230             : }
    1231             : 
    1232             : /*
    1233             :  * Return the offset of the current match (state->last_match).
    1234             :  *
    1235             :  * The offset is in characters, 1-based.  Side effect: state->refpoint and state->refpos are advanced to the match, so a later call only counts the characters between the previous match and the new one.
    1236             :  */
    1237             : static int
    1238          72 : text_position_get_match_pos(TextPositionState *state)
    1239             : {
    1240             :     /* Convert the byte position to char position: count the characters between the reference point and the match, and add them to the running total. */
    1241         144 :     state->refpos += pg_mbstrlen_with_len(state->refpoint,
    1242          72 :                                           state->last_match - state->refpoint);
    1243          72 :     state->refpoint = state->last_match;    /* the match becomes the new reference point */
    1244          72 :     return state->refpos + 1;   /* refpos is 0-based (it starts at 0 in text_position_reset); report 1-based */
    1245             : }
    1246             : 
    1247             : /*
    1248             :  * Reset search state to the initial state installed by text_position_setup.
    1249             :  * Only last_match, refpoint and refpos are written here; no other field of the state is changed.
    1250             :  * The next call to text_position_next will search from the beginning
    1251             :  * of the string.
    1252             :  */
    1253             : static void
    1254          12 : text_position_reset(TextPositionState *state)
    1255             : {
    1256          12 :     state->last_match = NULL;   /* forget any previous match */
    1257          12 :     state->refpoint = state->str1;  /* character counting restarts at str1 ... */
    1258          12 :     state->refpos = 0;  /* ... with a character offset of zero */
    1259          12 : }
    1260             : /* Cleanup hook for a TextPositionState; currently a no-op (the body is empty, nothing is freed or reset). */
    1261             : static void
    1262        1772 : text_position_cleanup(TextPositionState *state)
    1263             : {
    1264             :     /* no cleanup needed */
    1265        1772 : }
    1266             : 
    1267             : /* Verify that a collation was supplied: reports ERRCODE_INDETERMINATE_COLLATION via ereport(ERROR) when collid fails OidIsValid(); otherwise does nothing. */
    1268             : static void
    1269    16991066 : check_collation_set(Oid collid)
    1270             : {
    1271    16991066 :     if (!OidIsValid(collid))
    1272             :     {
    1273             :         /*
    1274             :          * This typically means that the parser could not resolve a conflict
    1275             :          * of implicit collations, so report it that way.
    1276             :          */
    1277          30 :         ereport(ERROR,
    1278             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
    1279             :                  errmsg("could not determine which collation to use for string comparison"),
    1280             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
    1281             :     }
    1282    16991036 : }
    1283             : 
    1284             : /*
    1285             :  * varstr_cmp()
    1286             :  *
    1287             :  * Comparison function for text strings with given lengths, using the
    1288             :  * appropriate locale. Returns an integer less than, equal to, or greater than
    1289             :  * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
    1290             :  * len1 and len2 are byte counts (they are handed to memcmp()); an invalid collid is reported as an error by check_collation_set().
    1291             :  * Note: many functions that depend on this are marked leakproof; therefore,
    1292             :  * avoid reporting the actual contents of the input when throwing errors.
    1293             :  * All errors herein should be things that can't happen except on corrupt
    1294             :  * data, anyway; otherwise we will have trouble with indexing strings that
    1295             :  * would cause them.
    1296             :  */
    1297             : int
    1298     9830094 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
    1299             : {
    1300             :     int         result;
    1301             :     pg_locale_t mylocale;
    1302             : 
    1303     9830094 :     check_collation_set(collid);
    1304             : 
    1305     9830076 :     mylocale = pg_newlocale_from_collation(collid);
    1306             : 
    1307     9830076 :     if (mylocale->collate_is_c)
    1308             :     {   /* C collation: bytewise order; when one string is a prefix of the other, the shorter one sorts first */
    1309     3827404 :         result = memcmp(arg1, arg2, Min(len1, len2));
    1310     3827404 :         if ((result == 0) && (len1 != len2))
    1311      133916 :             result = (len1 < len2) ? -1 : 1;
    1312             :     }
    1313             :     else
    1314             :     {
    1315             :         /*
    1316             :          * memcmp() can't tell us which of two unequal strings sorts first,
    1317             :          * but it's a cheap way to tell if they're equal.  Testing shows that
    1318             :          * memcmp() followed by strcoll() is only trivially slower than
    1319             :          * strcoll() by itself, so we don't lose much if this doesn't work out
    1320             :          * very often, and if it does - for example, because there are many
    1321             :          * equal strings in the input - then we win big by avoiding expensive
    1322             :          * collation-aware comparisons.
    1323             :          */
    1324     6002672 :         if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
    1325     1558124 :             return 0;
    1326             : 
    1327     4444548 :         result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
    1328             : 
    1329             :         /* Break tie if necessary: bytewise-identical inputs already returned 0 above, so for a deterministic collation a collation-level tie is resolved bytewise here. */
    1330     4444548 :         if (result == 0 && mylocale->deterministic)
    1331             :         {
    1332           0 :             result = memcmp(arg1, arg2, Min(len1, len2));
    1333           0 :             if ((result == 0) && (len1 != len2))
    1334           0 :                 result = (len1 < len2) ? -1 : 1;
    1335             :         }
    1336             :     }
    1337             : 
    1338     8271952 :     return result;
    1339             : }
    1340             : 
    1341             : /* text_cmp()
    1342             :  * Internal comparison function for text strings.
    1343             :  * Returns varstr_cmp()'s result: an integer less than, equal to, or greater than zero (not necessarily -1, 0 or 1, since varstr_cmp() can return a raw memcmp() value).
    1344             :  */
    1345             : static int
    1346     7740710 : text_cmp(text *arg1, text *arg2, Oid collid)
    1347             : {
    1348             :     char       *a1p,
    1349             :                *a2p;
    1350             :     int         len1,
    1351             :                 len2;
    1352             : 
    1353     7740710 :     a1p = VARDATA_ANY(arg1);
    1354     7740710 :     a2p = VARDATA_ANY(arg2);
    1355             : 
    1356     7740710 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1357     7740710 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1358             : 
    1359     7740710 :     return varstr_cmp(a1p, len1, a2p, len2, collid);    /* collation validation and locale lookup happen inside varstr_cmp() */
    1360             : }
    1361             : 
    1362             : /*
    1363             :  * Comparison functions for text strings.
    1364             :  *
    1365             :  * Note: btree indexes need these routines not to leak memory; therefore,
    1366             :  * be careful to free working copies of toasted datums.  Most places don't
    1367             :  * need to be so careful.
    1368             :  */
    1369             : /* texteq: fmgr-callable equality test for two text arguments under the call's collation (PG_GET_COLLATION()); returns a bool Datum.  An invalid collation is reported by check_collation_set(). */
    1370             : Datum
    1371     6734060 : texteq(PG_FUNCTION_ARGS)
    1372             : {
    1373     6734060 :     Oid         collid = PG_GET_COLLATION();
    1374     6734060 :     pg_locale_t mylocale = NULL;
    1375             :     bool        result;
    1376             : 
    1377     6734060 :     check_collation_set(collid);
    1378             : 
    1379     6734060 :     mylocale = pg_newlocale_from_collation(collid);
    1380             : 
    1381     6734060 :     if (mylocale->deterministic)
    1382             :     {
    1383     6725620 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1384     6725620 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1385             :         Size        len1,
    1386             :                     len2;
    1387             : 
    1388             :         /*
    1389             :          * Since we only care about equality or not-equality, we can avoid all
    1390             :          * the expense of strcoll() here, and just do bitwise comparison.  In
    1391             :          * fact, we don't even have to do a bitwise comparison if we can show
    1392             :          * the lengths of the strings are unequal; which might save us from
    1393             :          * having to detoast one or both values.
    1394             :          */
    1395     6725620 :         len1 = toast_raw_datum_size(arg1);
    1396     6725620 :         len2 = toast_raw_datum_size(arg2);
    1397     6725620 :         if (len1 != len2)
    1398     3200982 :             result = false;     /* different sizes: unequal, and neither value had to be fetched */
    1399             :         else
    1400             :         {
    1401     3524638 :             text       *targ1 = DatumGetTextPP(arg1);
    1402     3524638 :             text       *targ2 = DatumGetTextPP(arg2);
    1403             : 
    1404     3524638 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1405             :                              len1 - VARHDRSZ) == 0);   /* presumably toast_raw_datum_size() counts a VARHDRSZ-byte header, hence the subtraction -- confirm */
    1406             : 
    1407     3524638 :             PG_FREE_IF_COPY(targ1, 0);
    1408     3524638 :             PG_FREE_IF_COPY(targ2, 1);
    1409             :         }
    1410             :     }
    1411             :     else
    1412             :     {   /* nondeterministic collation: bytewise-different strings can still compare equal, so use the full collation-aware text_cmp() */
    1413        8440 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1414        8440 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1415             : 
    1416        8440 :         result = (text_cmp(arg1, arg2, collid) == 0);
    1417             : 
    1418        8440 :         PG_FREE_IF_COPY(arg1, 0);
    1419        8440 :         PG_FREE_IF_COPY(arg2, 1);
    1420             :     }
    1421             : 
    1422     6734060 :     PG_RETURN_BOOL(result);
    1423             : }
    1424             : /* textne: fmgr-callable inequality test for two text arguments; the exact negation of texteq()'s result, computed with the same two strategies. */
    1425             : Datum
    1426       25446 : textne(PG_FUNCTION_ARGS)
    1427             : {
    1428       25446 :     Oid         collid = PG_GET_COLLATION();
    1429             :     pg_locale_t mylocale;
    1430             :     bool        result;
    1431             : 
    1432       25446 :     check_collation_set(collid);
    1433             : 
    1434       25446 :     mylocale = pg_newlocale_from_collation(collid);
    1435             : 
    1436       25446 :     if (mylocale->deterministic)
    1437             :     {
    1438       25422 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1439       25422 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1440             :         Size        len1,
    1441             :                     len2;
    1442             : 
    1443             :         /* See comment in texteq(): compare sizes first, and only fetch and memcmp() the data when the sizes match */
    1444       25422 :         len1 = toast_raw_datum_size(arg1);
    1445       25422 :         len2 = toast_raw_datum_size(arg2);
    1446       25422 :         if (len1 != len2)
    1447        4458 :             result = true;      /* different sizes: certainly not equal */
    1448             :         else
    1449             :         {
    1450       20964 :             text       *targ1 = DatumGetTextPP(arg1);
    1451       20964 :             text       *targ2 = DatumGetTextPP(arg2);
    1452             : 
    1453       20964 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1454             :                              len1 - VARHDRSZ) != 0);   /* presumably toast_raw_datum_size() counts a VARHDRSZ-byte header, hence the subtraction -- confirm */
    1455             : 
    1456       20964 :             PG_FREE_IF_COPY(targ1, 0);
    1457       20964 :             PG_FREE_IF_COPY(targ2, 1);
    1458             :         }
    1459             :     }
    1460             :     else
    1461             :     {   /* nondeterministic collation: needs the full collation-aware text_cmp() */
    1462          24 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1463          24 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1464             : 
    1465          24 :         result = (text_cmp(arg1, arg2, collid) != 0);
    1466             : 
    1467          24 :         PG_FREE_IF_COPY(arg1, 0);
    1468          24 :         PG_FREE_IF_COPY(arg2, 1);
    1469             :     }
    1470             : 
    1471       25446 :     PG_RETURN_BOOL(result);
    1472             : }
    1473             : /* text_lt: fmgr-callable "less than" for text; true iff text_cmp() under the call's collation returns a negative value. */
    1474             : Datum
    1475      212408 : text_lt(PG_FUNCTION_ARGS)
    1476             : {
    1477      212408 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1478      212408 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1479             :     bool        result;
    1480             : 
    1481      212408 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
    1482             : 
    1483      212390 :     PG_FREE_IF_COPY(arg1, 0);   /* free working copies, per the no-leak note above these comparison functions */
    1484      212390 :     PG_FREE_IF_COPY(arg2, 1);
    1485             : 
    1486      212390 :     PG_RETURN_BOOL(result);
    1487             : }
    1488             : /* text_le: fmgr-callable "less than or equal" for text; true iff text_cmp() under the call's collation returns a value <= 0. */
    1489             : Datum
    1490      318608 : text_le(PG_FUNCTION_ARGS)
    1491             : {
    1492      318608 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1493      318608 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1494             :     bool        result;
    1495             : 
    1496      318608 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
    1497             : 
    1498      318608 :     PG_FREE_IF_COPY(arg1, 0);   /* free working copies, per the no-leak note above these comparison functions */
    1499      318608 :     PG_FREE_IF_COPY(arg2, 1);
    1500             : 
    1501      318608 :     PG_RETURN_BOOL(result);
    1502             : }
    1503             : /* text_gt: fmgr-callable "greater than" for text; true iff text_cmp() under the call's collation returns a positive value. */
    1504             : Datum
    1505      196002 : text_gt(PG_FUNCTION_ARGS)
    1506             : {
    1507      196002 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1508      196002 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1509             :     bool        result;
    1510             : 
    1511      196002 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
    1512             : 
    1513      196002 :     PG_FREE_IF_COPY(arg1, 0);   /* free working copies, per the no-leak note above these comparison functions */
    1514      196002 :     PG_FREE_IF_COPY(arg2, 1);
    1515             : 
    1516      196002 :     PG_RETURN_BOOL(result);
    1517             : }
    1518             : /* text_ge: fmgr-callable "greater than or equal" for text; true iff text_cmp() under the call's collation returns a value >= 0. */
    1519             : Datum
    1520      176220 : text_ge(PG_FUNCTION_ARGS)
    1521             : {
    1522      176220 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1523      176220 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1524             :     bool        result;
    1525             : 
    1526      176220 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
    1527             : 
    1528      176220 :     PG_FREE_IF_COPY(arg1, 0);   /* free working copies, per the no-leak note above these comparison functions */
    1529      176220 :     PG_FREE_IF_COPY(arg2, 1);
    1530             : 
    1531      176220 :     PG_RETURN_BOOL(result);
    1532             : }
    1533             : /* text_starts_with: fmgr-callable prefix test; true iff the first argument begins with the bytes of the second.  Reports ERRCODE_FEATURE_NOT_SUPPORTED for a nondeterministic collation. */
    1534             : Datum
    1535       37914 : text_starts_with(PG_FUNCTION_ARGS)
    1536             : {
    1537       37914 :     Datum       arg1 = PG_GETARG_DATUM(0);
    1538       37914 :     Datum       arg2 = PG_GETARG_DATUM(1);
    1539       37914 :     Oid         collid = PG_GET_COLLATION();
    1540             :     pg_locale_t mylocale;
    1541             :     bool        result;
    1542             :     Size        len1,
    1543             :                 len2;
    1544             : 
    1545       37914 :     check_collation_set(collid);
    1546             : 
    1547       37914 :     mylocale = pg_newlocale_from_collation(collid);
    1548             : 
    1549       37914 :     if (!mylocale->deterministic)   /* the test below is a bytewise memcmp(); nondeterministic collations are rejected up front */
    1550           0 :         ereport(ERROR,
    1551             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1552             :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1553             : 
    1554       37914 :     len1 = toast_raw_datum_size(arg1);
    1555       37914 :     len2 = toast_raw_datum_size(arg2);
    1556       37914 :     if (len2 > len1)    /* prefix is larger than the string: cannot match, and neither value had to be fetched */
    1557           0 :         result = false;
    1558             :     else
    1559             :     {
    1560       37914 :         text       *targ1 = text_substring(arg1, 1, len2, false);  /* NOTE(review): len2 is a raw datum size; presumably this requests at least as much of arg1 as arg2 is long -- confirm against text_substring() */
    1561       37914 :         text       *targ2 = DatumGetTextPP(arg2);
    1562             : 
    1563       37914 :         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1564       37914 :                          VARSIZE_ANY_EXHDR(targ2)) == 0);  /* compare exactly as many bytes as the prefix holds */
    1565             : 
    1566       37914 :         PG_FREE_IF_COPY(targ1, 0);
    1567       37914 :         PG_FREE_IF_COPY(targ2, 1);
    1568             :     }
    1569             : 
    1570       37914 :     PG_RETURN_BOOL(result);
    1571             : }
    1572             : /* bttextcmp: fmgr-callable three-way comparison of two text arguments; returns text_cmp()'s result (negative, zero or positive) as an int32 Datum. */
    1573             : Datum
    1574     6513372 : bttextcmp(PG_FUNCTION_ARGS)
    1575             : {
    1576     6513372 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1577     6513372 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1578             :     int32       result;
    1579             : 
    1580     6513372 :     result = text_cmp(arg1, arg2, PG_GET_COLLATION());
    1581             : 
    1582     6513372 :     PG_FREE_IF_COPY(arg1, 0);   /* free working copies, per the no-leak note above these comparison functions */
    1583     6513372 :     PG_FREE_IF_COPY(arg2, 1);
    1584             : 
    1585     6513372 :     PG_RETURN_INT32(result);
    1586             : }
    1587             : /* bttextsortsupport: fills in the SortSupport passed as argument 0 for type text by calling varstr_sortsupport() with TEXTOID and ssup->ssup_collation; returns void. */
    1588             : Datum
    1589      105644 : bttextsortsupport(PG_FUNCTION_ARGS)
    1590             : {
    1591      105644 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    1592      105644 :     Oid         collid = ssup->ssup_collation;
    1593             :     MemoryContext oldcontext;
    1594             : 
    1595      105644 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); /* varstr_sortsupport() runs, and does its palloc()s, while ssup_cxt is current */
    1596             : 
    1597             :     /* Use generic string SortSupport */
    1598      105644 :     varstr_sortsupport(ssup, TEXTOID, collid);
    1599             : 
    1600      105632 :     MemoryContextSwitchTo(oldcontext);  /* restore the caller's memory context */
    1601             : 
    1602      105632 :     PG_RETURN_VOID();
    1603             : }
    1604             : 
    1605             : /*
    1606             :  * Generic sortsupport interface for character type's operator classes.
    1607             :  * Includes locale support, and support for BpChar semantics (i.e. removing
    1608             :  * trailing spaces before comparison).
    1609             :  * Fills in ssup: always ssup->comparator, plus ssup_extra and the abbrev_* fields when needed.  typid selects the BPCHAROID / NAMEOID special cases; an invalid collid is reported by check_collation_set().
    1610             :  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
    1611             :  * same representation.  Callers that always use the C collation (e.g.
    1612             :  * non-collatable type callers like bytea) may have NUL bytes in their strings;
    1613             :  * this will not work with any other collation, though.
    1614             :  */
    1615             : void
    1616      156466 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
    1617             : {
    1618      156466 :     bool        abbreviate = ssup->abbreviate;  /* caller's request; may be switched off below */
    1619      156466 :     bool        collate_c = false;
    1620             :     VarStringSortSupport *sss;
    1621             :     pg_locale_t locale;
    1622             : 
    1623      156466 :     check_collation_set(collid);
    1624             : 
    1625      156454 :     locale = pg_newlocale_from_collation(collid);
    1626             : 
    1627             :     /*
    1628             :      * If possible, set ssup->comparator to a function which can be used to
    1629             :      * directly compare two datums.  If we can do this, we'll avoid the
    1630             :      * overhead of a trip through the fmgr layer for every comparison, which
    1631             :      * can be substantial.
    1632             :      *
    1633             :      * Most typically, we'll set the comparator to varlenafastcmp_locale,
    1634             :      * which uses strcoll() to perform comparisons.  We use that for the
    1635             :      * BpChar case too, but type NAME uses namefastcmp_locale. However, if
    1636             :      * LC_COLLATE = C, we can make things quite a bit faster with
    1637             :      * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
    1638             :      * memcmp() rather than strcoll().
    1639             :      */
    1640      156454 :     if (locale->collate_is_c)
    1641             :     {
    1642      110564 :         if (typid == BPCHAROID)
    1643         342 :             ssup->comparator = bpcharfastcmp_c;
    1644      110222 :         else if (typid == NAMEOID)
    1645             :         {
    1646       49726 :             ssup->comparator = namefastcmp_c;
    1647             :             /* Not supporting abbreviation with type NAME, for now */
    1648       49726 :             abbreviate = false;
    1649             :         }
    1650             :         else
    1651       60496 :             ssup->comparator = varstrfastcmp_c;
    1652             : 
    1653      110564 :         collate_c = true;
    1654             :     }
    1655             :     else
    1656             :     {
    1657             :         /*
    1658             :          * We use varlenafastcmp_locale except for type NAME.
    1659             :          */
    1660       45890 :         if (typid == NAMEOID)
    1661             :         {
    1662           0 :             ssup->comparator = namefastcmp_locale;
    1663             :             /* Not supporting abbreviation with type NAME, for now */
    1664           0 :             abbreviate = false;
    1665             :         }
    1666             :         else
    1667       45890 :             ssup->comparator = varlenafastcmp_locale;
    1668             : 
    1669             :         /*
    1670             :          * Unfortunately, it seems that abbreviation for non-C collations is
    1671             :          * broken on many common platforms; see pg_strxfrm_enabled().
    1672             :          *
    1673             :          * Even apart from the risk of broken locales, it's possible that
    1674             :          * there are platforms where the use of abbreviated keys should be
    1675             :          * disabled at compile time.  Having only 4 byte datums could make
    1676             :          * worst-case performance drastically more likely, for example.
    1677             :          * Moreover, macOS's strxfrm() implementation is known to not
    1678             :          * effectively concentrate a significant amount of entropy from the
    1679             :          * original string in earlier transformed blobs.  It's possible that
    1680             :          * other supported platforms are similarly encumbered.  So, if we ever
    1681             :          * get past disabling this categorically, we may still want or need to
    1682             :          * disable it for particular platforms.
    1683             :          */
    1684       45890 :         if (!pg_strxfrm_enabled(locale))
    1685       45094 :             abbreviate = false;
    1686             :     }
    1687             : 
    1688             :     /*
    1689             :      * If we're using abbreviated keys, or if we're using a locale-aware
    1690             :      * comparison, we need to initialize a VarStringSortSupport object. Both
    1691             :      * cases will make use of the temporary buffers we initialize here for
    1692             :      * scratch space (and to detect requirement for BpChar semantics from
    1693             :      * caller), and the abbreviation case requires additional state.
    1694             :      */
    1695      156454 :     if (abbreviate || !collate_c)
    1696             :     {
    1697       88358 :         sss = palloc(sizeof(VarStringSortSupport));
    1698       88358 :         sss->buf1 = palloc(TEXTBUFLEN);
    1699       88358 :         sss->buflen1 = TEXTBUFLEN;
    1700       88358 :         sss->buf2 = palloc(TEXTBUFLEN);
    1701       88358 :         sss->buflen2 = TEXTBUFLEN;
    1702             :         /* Start with invalid values */
    1703       88358 :         sss->last_len1 = -1;
    1704       88358 :         sss->last_len2 = -1;
    1705             :         /* Initialize */
    1706       88358 :         sss->last_returned = 0;
    1707       88358 :         if (collate_c)
    1708       42468 :             sss->locale = NULL;     /* C collation: the comparators use memcmp(), so no locale is stored */
    1709             :         else
    1710       45890 :             sss->locale = locale;
    1711             : 
    1712             :         /*
    1713             :          * To avoid somehow confusing a strxfrm() blob and an original string,
    1714             :          * constantly keep track of the variety of data that buf1 and buf2
    1715             :          * currently contain.
    1716             :          *
    1717             :          * Comparisons may be interleaved with conversion calls.  Frequently,
    1718             :          * conversions and comparisons are batched into two distinct phases,
    1719             :          * but the correctness of caching cannot hinge upon this.  For
    1720             :          * comparison caching, buffer state is only trusted if cache_blob is
    1721             :          * found set to false, whereas strxfrm() caching only trusts the state
    1722             :          * when cache_blob is found set to true.
    1723             :          *
    1724             :          * Arbitrarily initialize cache_blob to true.
    1725             :          */
    1726       88358 :         sss->cache_blob = true;
    1727       88358 :         sss->collate_c = collate_c;
    1728       88358 :         sss->typid = typid;
    1729       88358 :         ssup->ssup_extra = sss;
    1730             : 
    1731             :         /*
    1732             :          * If possible, plan to use the abbreviated keys optimization.  The
    1733             :          * core code may switch back to authoritative comparator should
    1734             :          * abbreviation be aborted.
    1735             :          */
    1736       88358 :         if (abbreviate)
    1737             :         {
    1738       43066 :             sss->prop_card = 0.20;  /* NOTE(review): 0.20 and the 10 passed to initHyperLogLog() below are tuning constants whose rationale is not visible here */
    1739       43066 :             initHyperLogLog(&sss->abbr_card, 10);
    1740       43066 :             initHyperLogLog(&sss->full_card, 10);
    1741       43066 :             ssup->abbrev_full_comparator = ssup->comparator;    /* keep the comparator chosen above as the authoritative one */
    1742       43066 :             ssup->comparator = ssup_datum_unsigned_cmp; /* the primary comparator now works on abbreviated keys */
    1743       43066 :             ssup->abbrev_converter = varstr_abbrev_convert;
    1744       43066 :             ssup->abbrev_abort = varstr_abbrev_abort;
    1745             :         }
    1746             :     }
    1747      156454 : }
    1748             : 
    1749             : /*
    1750             :  * sortsupport comparison func (for C locale case): bytewise memcmp() over the common length; if that ties, the shorter string sorts first.  Returns <0, 0 or >0.  ssup is not used.
    1751             :  */
    1752             : static int
    1753    43147360 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1754             : {
    1755    43147360 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1756    43147360 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1757             :     char       *a1p,
    1758             :                *a2p;
    1759             :     int         len1,
    1760             :                 len2,
    1761             :                 result;
    1762             : 
    1763    43147360 :     a1p = VARDATA_ANY(arg1);
    1764    43147360 :     a2p = VARDATA_ANY(arg2);
    1765             : 
    1766    43147360 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1767    43147360 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1768             : 
    1769    43147360 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1770    43147360 :     if ((result == 0) && (len1 != len2))
    1771     1173988 :         result = (len1 < len2) ? -1 : 1;
    1772             : 
    1773             :     /* We can't afford to leak memory here: if DatumGetVarStringPP() handed back a pointer different from the input datum, it is a copy and is freed. */
    1774    43147360 :     if (PointerGetDatum(arg1) != x)
    1775           4 :         pfree(arg1);
    1776    43147360 :     if (PointerGetDatum(arg2) != y)
    1777           4 :         pfree(arg2);
    1778             : 
    1779    43147360 :     return result;
    1780             : }
    1781             : 
    1782             : /*
    1783             :  * sortsupport comparison func (for BpChar C locale case)
    1784             :  *
    1785             :  * BpChar outsources its sortsupport to this module.  Specialization for the
    1786             :  * varstr_sortsupport BpChar case, modeled on
    1787             :  * internal_bpchar_pattern_compare().
                     :  *
                     :  * Both lengths are taken from bpchartruelen() (the byte count ignoring
                     :  * trailing spaces), so the memcmp() and the shorter-sorts-first length
                     :  * tie-break both operate on those reduced lengths.  ssup is not consulted
                     :  * here.
    1788             :  */
    1789             : static int
    1790       63536 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1791             : {
    1792       63536 :     BpChar     *arg1 = DatumGetBpCharPP(x);
    1793       63536 :     BpChar     *arg2 = DatumGetBpCharPP(y);
    1794             :     char       *a1p,
    1795             :                *a2p;
    1796             :     int         len1,
    1797             :                 len2,
    1798             :                 result;
    1799             : 
    1800       63536 :     a1p = VARDATA_ANY(arg1);
    1801       63536 :     a2p = VARDATA_ANY(arg2);
    1802             : 
    1803       63536 :     len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
    1804       63536 :     len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
    1805             : 
    1806       63536 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1807       63536 :     if ((result == 0) && (len1 != len2))
    1808           8 :         result = (len1 < len2) ? -1 : 1;
    1809             : 
    1810             :     /*
                     :      * We can't afford to leak memory here.  Free whatever
                     :      * DatumGetBpCharPP() returned if it is not the input Datum itself
                     :      * (presumably a detoasted copy).
                     :      */
    1811       63536 :     if (PointerGetDatum(arg1) != x)
    1812           0 :         pfree(arg1);
    1813       63536 :     if (PointerGetDatum(arg2) != y)
    1814           0 :         pfree(arg2);
    1815             : 
    1816       63536 :     return result;
    1817             : }
    1818             : 
    1819             : /*
    1820             :  * sortsupport comparison func (for NAME C locale case)
                     :  *
                     :  * Compares the two names with strncmp(), looking at no more than
                     :  * NAMEDATALEN bytes of each, and returns strncmp()'s result unchanged.
                     :  * Nothing is allocated or freed, and ssup is not consulted here.
    1821             :  */
    1822             : static int
    1823    38908976 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
    1824             : {
    1825    38908976 :     Name        arg1 = DatumGetName(x);
    1826    38908976 :     Name        arg2 = DatumGetName(y);
    1827             : 
    1828    38908976 :     return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
    1829             : }
    1830             : 
    1831             : /*
    1832             :  * sortsupport comparison func (for locale case with all varlena types)
                     :  *
                     :  * Thin wrapper: extracts the data pointer and byte length of each datum
                     :  * and passes them, together with ssup, to varstrfastcmp_locale(), whose
                     :  * result is returned as-is.
    1833             :  */
    1834             : static int
    1835    35368948 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1836             : {
    1837    35368948 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1838    35368948 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1839             :     char       *a1p,
    1840             :                *a2p;
    1841             :     int         len1,
    1842             :                 len2,
    1843             :                 result;
    1844             : 
    1845    35368948 :     a1p = VARDATA_ANY(arg1);
    1846    35368948 :     a2p = VARDATA_ANY(arg2);
    1847             : 
    1848    35368948 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1849    35368948 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1850             : 
    1851    35368948 :     result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
    1852             : 
    1853             :     /*
                     :      * We can't afford to leak memory here.  Free whatever
                     :      * DatumGetVarStringPP() returned if it is not the input Datum itself
                     :      * (presumably a detoasted copy).  This happens only after the
                     :      * comparison, since a1p/a2p point into those values.
                     :      */
    1854    35368948 :     if (PointerGetDatum(arg1) != x)
    1855           4 :         pfree(arg1);
    1856    35368948 :     if (PointerGetDatum(arg2) != y)
    1857           4 :         pfree(arg2);
    1858             : 
    1859    35368948 :     return result;
    1860             : }
    1861             : 
    1862             : /*
    1863             :  * sortsupport comparison func (for locale case with NAME type)
                     :  *
                     :  * Thin wrapper: each name is handed to varstrfastcmp_locale() as a pointer
                     :  * plus its strlen(), along with ssup; that function's result is returned
                     :  * as-is.  Nothing is allocated or freed here.
    1864             :  */
    1865             : static int
    1866           0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1867             : {
    1868           0 :     Name        arg1 = DatumGetName(x);
    1869           0 :     Name        arg2 = DatumGetName(y);
    1870             : 
    1871           0 :     return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
    1872           0 :                                 NameStr(*arg2), strlen(NameStr(*arg2)),
    1873             :                                 ssup);
    1874             : }
    1875             : 
    1876             : /*
    1877             :  * sortsupport comparison func for locale cases
                     :  *
                     :  * a1p/len1 and a2p/len2 describe the two strings as pointer plus byte
                     :  * length.  ssup->ssup_extra supplies the VarStringSortSupport state: two
                     :  * scratch buffers (buf1/buf2), their sizes, the lengths last stored in
                     :  * them, the locale, and the last result.
                     :  *
                     :  * Order of operations:
                     :  *  1. Byte-identical inputs (same length, memcmp() == 0) return 0 at once,
                     :  *     without touching the cached state.
                     :  *  2. For BPCHAROID, both lengths are reduced with bpchartruelen().
                     :  *  3. Each scratch buffer is enlarged when the length does not leave room
                     :  *     for a terminating NUL (hence ">=").
                     :  *  4. Each input is copied into its buffer and NUL-terminated, unless the
                     :  *     buffer already holds exactly those bytes.
                     :  *  5. If both inputs matched the buffered strings and cache_blob is false,
                     :  *     last_returned is reused; otherwise pg_strcoll() is called on the two
                     :  *     buffers and its result (after the tie-break below) is recorded.
    1878             :  */
    1879             : static int
    1880    35368948 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
    1881             : {
    1882    35368948 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    1883             :     int         result;
    1884             :     bool        arg1_match;
    1885             : 
    1886             :     /* Fast pre-check for equality, as discussed in varstr_cmp() */
    1887    35368948 :     if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
    1888             :     {
    1889             :         /*
    1890             :          * No change in buf1 or buf2 contents, so avoid changing last_len1 or
    1891             :          * last_len2.  Existing contents of buffers might still be used by
    1892             :          * next call.
    1893             :          *
    1894             :          * It's fine to allow the comparison of BpChar padding bytes here,
    1895             :          * even though that implies that the memcmp() will usually be
    1896             :          * performed for BpChar callers (though multibyte characters could
    1897             :          * still prevent that from occurring).  The memcmp() is still very
    1898             :          * cheap, and BpChar's funny semantics have us remove trailing spaces
    1899             :          * (not limited to padding), so we need make no distinction between
    1900             :          * padding space characters and "real" space characters.
    1901             :          */
    1902     9251480 :         return 0;
    1903             :     }
    1904             : 
    1905    26117468 :     if (sss->typid == BPCHAROID)
    1906             :     {
    1907             :         /* Get true number of bytes, ignoring trailing spaces */
    1908       34544 :         len1 = bpchartruelen(a1p, len1);
    1909       34544 :         len2 = bpchartruelen(a2p, len2);
    1910             :     }
    1911             : 
    1912    26117468 :     if (len1 >= sss->buflen1)
    1913             :     {
    1914          14 :         sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    1915          14 :         sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    1916             :     }
    1917    26117468 :     if (len2 >= sss->buflen2)
    1918             :     {
    1919          10 :         sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
    1920          10 :         sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    1921             :     }
    1922             : 
    1923             :     /*
    1924             :      * We're likely to be asked to compare the same strings repeatedly, and
    1925             :      * memcmp() is so much cheaper than strcoll() that it pays to try to cache
    1926             :      * comparisons, even though in general there is no reason to think that
    1927             :      * that will work out (every string datum may be unique).  Caching does
    1928             :      * not slow things down measurably when it doesn't work out, and can speed
    1929             :      * things up by rather a lot when it does.  In part, this is because the
    1930             :      * memcmp() compares data from cachelines that are needed in L1 cache even
    1931             :      * when the last comparison's result cannot be reused.
                     :      *
                     :      * arg1_match records whether buf1 already held the first string; it is
                     :      * consulted below, together with the same test on buf2, to decide
                     :      * whether the previous result can be returned.
    1932             :      */
    1933    26117468 :     arg1_match = true;
    1934    26117468 :     if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
    1935             :     {
    1936    24143698 :         arg1_match = false;
    1937    24143698 :         memcpy(sss->buf1, a1p, len1);
    1938    24143698 :         sss->buf1[len1] = '\0';
    1939    24143698 :         sss->last_len1 = len1;
    1940             :     }
    1941             : 
    1942             :     /*
    1943             :      * If we're comparing the same two strings as last time, we can return the
    1944             :      * same answer without calling strcoll() again.  This is more likely than
    1945             :      * it seems (at least with moderate to low cardinality sets), because
    1946             :      * quicksort compares the same pivot against many values.
    1947             :      */
    1948    26117468 :     if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
    1949             :     {
    1950     3963368 :         memcpy(sss->buf2, a2p, len2);
    1951     3963368 :         sss->buf2[len2] = '\0';
    1952     3963368 :         sss->last_len2 = len2;
    1953             :     }
    1954    22154100 :     else if (arg1_match && !sss->cache_blob)
    1955             :     {
    1956             :         /* Use result cached following last actual strcoll() call */
    1957     1557712 :         return sss->last_returned;
    1958             :     }
    1959             : 
    1960    24559756 :     result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
    1961             : 
    1962             :     /*
                     :      * Break tie if necessary.  Only deterministic locales get the bytewise
                     :      * strcmp() fallback; otherwise a zero from pg_strcoll() stands.
                     :      */
    1963    24559756 :     if (result == 0 && sss->locale->deterministic)
    1964           0 :         result = strcmp(sss->buf1, sss->buf2);
    1965             : 
    1966             :     /* Cache result, perhaps saving an expensive strcoll() call next time */
    1967    24559756 :     sss->cache_blob = false;
    1968    24559756 :     sss->last_returned = result;
    1969    24559756 :     return result;
    1970             : }
    1971             : 
    1972             : /*
    1973             :  * Conversion routine for sortsupport.  Converts original to abbreviated key
    1974             :  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
    1975             :  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
    1976             :  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
    1977             :  * locale is used, or in case of bytea, just memcpy() from original instead.
                     :  *
                     :  * original is the authoritative datum; ssup->ssup_extra supplies the
                     :  * VarStringSortSupport state (scratch buffers, locale, HyperLogLog
                     :  * counters).  The return value is the abbreviated key, already passed
                     :  * through DatumBigEndianToNative().  The number of bytes packed is
                     :  * sizeof(Datum) at most; unused bytes stay zero.
                     :  *
                     :  * Side effects: in the non-C path buf1 receives a NUL-terminated copy of
                     :  * the input and buf2 the transformed blob, with last_len1/last_len2
                     :  * updated to match.  Every call that computes a fresh key feeds a hash of
                     :  * the full key to full_card and a hash of the abbreviated key to abbr_card,
                     :  * then sets cache_blob to true; the path that reuses the cached blob jumps
                     :  * to "done" and skips both.
    1978             :  */
    1979             : static Datum
    1980      884928 : varstr_abbrev_convert(Datum original, SortSupport ssup)
    1981             : {
    1982      884928 :     const size_t max_prefix_bytes = sizeof(Datum);
    1983      884928 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    1984      884928 :     VarString  *authoritative = DatumGetVarStringPP(original);
    1985      884928 :     char       *authoritative_data = VARDATA_ANY(authoritative);
    1986             : 
    1987             :     /* working state */
    1988             :     Datum       res;
    1989             :     char       *pres;
    1990             :     int         len;
    1991             :     uint32      hash;
    1992             : 
    1993      884928 :     pres = (char *) &res;
    1994             :     /* memset(), so any non-overwritten bytes are NUL */
    1995      884928 :     memset(pres, 0, max_prefix_bytes);
    1996      884928 :     len = VARSIZE_ANY_EXHDR(authoritative);
    1997             : 
    1998             :     /* Get number of bytes, ignoring trailing spaces */
    1999      884928 :     if (sss->typid == BPCHAROID)
    2000        1362 :         len = bpchartruelen(authoritative_data, len);
    2001             : 
    2002             :     /*
    2003             :      * If we're using the C collation, use memcpy(), rather than strxfrm(), to
    2004             :      * abbreviate keys.  The full comparator for the C locale is always
    2005             :      * memcmp().  It would be incorrect to allow bytea callers (callers that
    2006             :      * always force the C collation -- bytea isn't a collatable type, but this
    2007             :      * approach is convenient) to use strxfrm().  This is because bytea
    2008             :      * strings may contain NUL bytes.  Besides, this should be faster, too.
    2009             :      *
    2010             :      * More generally, it's okay that bytea callers can have NUL bytes in
    2011             :      * strings because abbreviated cmp need not make a distinction between
    2012             :      * terminating NUL bytes, and NUL bytes representing actual NULs in the
    2013             :      * authoritative representation.  Hopefully a comparison at or past one
    2014             :      * abbreviated key's terminating NUL byte will resolve the comparison
    2015             :      * without consulting the authoritative representation; specifically, some
    2016             :      * later non-NUL byte in the longer string can resolve the comparison
    2017             :      * against a subsequent terminating NUL in the shorter string.  There will
    2018             :      * usually be what is effectively a "length-wise" resolution there and
    2019             :      * then.
    2020             :      *
    2021             :      * If that doesn't work out -- if all bytes in the longer string
    2022             :      * positioned at or past the offset of the smaller string's (first)
    2023             :      * terminating NUL are actually representative of NUL bytes in the
    2024             :      * authoritative binary string (perhaps with some *terminating* NUL bytes
    2025             :      * towards the end of the longer string iff it happens to still be small)
    2026             :      * -- then an authoritative tie-breaker will happen, and do the right
    2027             :      * thing: explicitly consider string length.
    2028             :      */
    2029      884928 :     if (sss->collate_c)
    2030      883092 :         memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
    2031             :     else
    2032             :     {
    2033             :         Size        bsize;
    2034             : 
    2035             :         /*
    2036             :          * We're not using the C collation, so fall back on strxfrm or ICU
    2037             :          * analogs.
    2038             :          */
    2039             : 
    2040             :         /* By convention, we use buffer 1 to store and NUL-terminate */
    2041        1836 :         if (len >= sss->buflen1)
    2042             :         {
    2043           0 :             sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2044           0 :             sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    2045             :         }
    2046             : 
    2047             :         /*
                     :          * Might be able to reuse strxfrm() blob from last call.  That needs
                     :          * the same input bytes still sitting in buf1 and cache_blob set,
                     :          * i.e. buf2 was last written by this function.
                     :          */
    2048        1836 :         if (sss->last_len1 == len && sss->cache_blob &&
    2049         918 :             memcmp(sss->buf1, authoritative_data, len) == 0)
    2050             :         {
    2051         168 :             memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
    2052             :             /* No change affecting cardinality, so no hashing required */
    2053         168 :             goto done;
    2054             :         }
    2055             : 
    2056        1668 :         memcpy(sss->buf1, authoritative_data, len);
    2057             : 
    2058             :         /*
    2059             :          * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
    2060             :          */
    2061        1668 :         sss->buf1[len] = '\0';
    2062        1668 :         sss->last_len1 = len;
    2063             : 
    2064        1668 :         if (pg_strxfrm_prefix_enabled(sss->locale))
    2065             :         {
    2066        1668 :             if (sss->buflen2 < max_prefix_bytes)
    2067             :             {
    2068           0 :                 sss->buflen2 = Max(max_prefix_bytes,
    2069             :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2070           0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2071             :             }
    2072             : 
    2073        1668 :             bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
    2074             :                                       max_prefix_bytes, sss->locale);
    2075        1668 :             sss->last_len2 = bsize;
    2076             :         }
    2077             :         else
    2078             :         {
    2079             :             /*
    2080             :              * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
    2081             :              * again.  The pg_strxfrm() function leaves the result buffer
    2082             :              * content undefined if the result did not fit, so we need to
    2083             :              * retry until everything fits, even though we only need the first
    2084             :              * few bytes in the end.
    2085             :              */
    2086             :             for (;;)
    2087             :             {
    2088           0 :                 bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
    2089             :                                    sss->locale);
    2090             : 
    2091           0 :                 sss->last_len2 = bsize;
    2092           0 :                 if (bsize < sss->buflen2)
    2093           0 :                     break;
    2094             : 
    2095             :                 /*
    2096             :                  * Grow buffer and retry.
    2097             :                  */
    2098           0 :                 sss->buflen2 = Max(bsize + 1,
    2099             :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2100           0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2101             :             }
    2102             :         }
    2103             : 
    2104             :         /*
    2105             :          * Every Datum byte is always compared.  This is safe because the
    2106             :          * strxfrm() blob is itself NUL terminated, leaving no danger of
    2107             :          * misinterpreting any NUL bytes not intended to be interpreted as
    2108             :          * logically representing termination.
    2109             :          *
    2110             :          * (Actually, even if there were NUL bytes in the blob it would be
    2111             :          * okay.  See remarks on bytea case above.)
    2112             :          */
    2113        1668 :         memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
    2114             :     }
    2115             : 
    2116             :     /*
    2117             :      * Maintain approximate cardinality of both abbreviated keys and original,
    2118             :      * authoritative keys using HyperLogLog.  Used as cheap insurance against
    2119             :      * the worst case, where we do many string transformations for no saving
    2120             :      * in full strcoll()-based comparisons.  These statistics are used by
    2121             :      * varstr_abbrev_abort().
    2122             :      *
    2123             :      * First, Hash key proper, or a significant fraction of it.  Mix in length
    2124             :      * in order to compensate for cases where differences are past
    2125             :      * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
    2126             :      */
    2127      884760 :     hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
    2128             :                                    Min(len, PG_CACHE_LINE_SIZE)));
    2129             : 
    2130      884760 :     if (len > PG_CACHE_LINE_SIZE)
    2131         458 :         hash ^= DatumGetUInt32(hash_uint32((uint32) len));
    2132             : 
    2133      884760 :     addHyperLogLog(&sss->full_card, hash);
    2134             : 
    2135             :     /*
                     :      * Hash abbreviated key.  With an 8-byte Datum the two 32-bit halves are
                     :      * XORed together first so that all key bytes influence the hash.
                     :      */
    2136             : #if SIZEOF_DATUM == 8
    2137             :     {
    2138             :         uint32      lohalf,
    2139             :                     hihalf;
    2140             : 
    2141      884760 :         lohalf = (uint32) res;
    2142      884760 :         hihalf = (uint32) (res >> 32);
    2143      884760 :         hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
    2144             :     }
    2145             : #else                           /* SIZEOF_DATUM != 8 */
    2146             :     hash = DatumGetUInt32(hash_uint32((uint32) res));
    2147             : #endif
    2148             : 
    2149      884760 :     addHyperLogLog(&sss->abbr_card, hash);
    2150             : 
    2151             :     /* Cache result, perhaps saving an expensive strxfrm() call next time */
    2152      884760 :     sss->cache_blob = true;
    2153      884928 : done:
    2154             : 
    2155             :     /*
    2156             :      * Byteswap on little-endian machines.
    2157             :      *
    2158             :      * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
    2159             :      * 3-way comparator) works correctly on all platforms.  If we didn't do
    2160             :      * this, the comparator would have to call memcmp() with a pair of
    2161             :      * pointers to the first byte of each abbreviated key, which is slower.
    2162             :      */
    2163      884928 :     res = DatumBigEndianToNative(res);
    2164             : 
    2165             :     /*
                     :      * Don't leak memory here: free whatever DatumGetVarStringPP() returned
                     :      * if it is not the input Datum itself (presumably a detoasted copy).
                     :      */
    2166      884928 :     if (PointerGetDatum(authoritative) != original)
    2167           8 :         pfree(authoritative);
    2168             : 
    2169      884928 :     return res;
    2170             : }
    2171             : 
    2172             : /*
    2173             :  * Callback for estimating effectiveness of abbreviated key optimization, using
    2174             :  * heuristic rules.  Returns value indicating if the abbreviation optimization
    2175             :  * should be aborted, based on its projected effectiveness.
                     :  *
                     :  * memtupcount is presumably the number of tuples gathered so far (it is
                     :  * only compared against thresholds and logged here).  The estimates come
                     :  * from the abbr_card and full_card HyperLogLog states kept in
                     :  * ssup->ssup_extra.
                     :  *
                     :  * Returns false (keep abbreviating) while memtupcount < 100, or whenever
                     :  * abbrev_distinct > key_distinct * prop_card; in the latter case prop_card
                     :  * is also multiplied by 0.65 once memtupcount exceeds 10000.  Returns true
                     :  * (abort) otherwise.  When trace_sort is set, the figures are logged.
    2176             :  */
    2177             : static bool
    2178        2378 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
    2179             : {
    2180        2378 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2181             :     double      abbrev_distinct,
    2182             :                 key_distinct;
    2183             : 
    2184             :     Assert(ssup->abbreviate);
    2185             : 
    2186             :     /* Have a little patience */
    2187        2378 :     if (memtupcount < 100)
    2188        1358 :         return false;
    2189             : 
    2190        1020 :     abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
    2191        1020 :     key_distinct = estimateHyperLogLog(&sss->full_card);
    2192             : 
    2193             :     /*
    2194             :      * Clamp cardinality estimates to at least one distinct value.  While
    2195             :      * NULLs are generally disregarded, if only NULL values were seen so far,
    2196             :      * that might misrepresent costs if we failed to clamp.
    2197             :      */
    2198        1020 :     if (abbrev_distinct <= 1.0)
    2199           0 :         abbrev_distinct = 1.0;
    2200             : 
    2201        1020 :     if (key_distinct <= 1.0)
    2202           0 :         key_distinct = 1.0;
    2203             : 
    2204             :     /*
    2205             :      * In the worst case all abbreviated keys are identical, while at the same
    2206             :      * time there are differences within full key strings not captured in
    2207             :      * abbreviations.
    2208             :      */
    2209        1020 :     if (trace_sort)
    2210             :     {
    2211           0 :         double      norm_abbrev_card = abbrev_distinct / (double) memtupcount;
    2212             : 
    2213           0 :         elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
    2214             :              "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
    2215             :              memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
    2216             :              sss->prop_card);
    2217             :     }
    2218             : 
    2219             :     /*
    2220             :      * If the number of distinct abbreviated keys approximately matches the
    2221             :      * number of distinct authoritative original keys, that's reason enough to
    2222             :      * proceed.  We can win even with a very low cardinality set if most
    2223             :      * tie-breakers only memcmp().  This is by far the most important
    2224             :      * consideration.
    2225             :      *
    2226             :      * While comparisons that are resolved at the abbreviated key level are
    2227             :      * considerably cheaper than tie-breakers resolved with memcmp(), both of
    2228             :      * those two outcomes are so much cheaper than a full strcoll() once
    2229             :      * sorting is underway that it doesn't seem worth it to weigh abbreviated
    2230             :      * cardinality against the overall size of the set in order to more
    2231             :      * accurately model costs.  Assume that an abbreviated comparison, and an
    2232             :      * abbreviated comparison with a cheap memcmp()-based authoritative
    2233             :      * resolution are equivalent.
    2234             :      */
    2235        1020 :     if (abbrev_distinct > key_distinct * sss->prop_card)
    2236             :     {
    2237             :         /*
    2238             :          * When we have exceeded 10,000 tuples, decay required cardinality
    2239             :          * aggressively for next call.
    2240             :          *
    2241             :          * This is useful because the number of comparisons required on
    2242             :          * average increases at a linearithmic rate, and at roughly 10,000
    2243             :          * tuples that factor will start to dominate over the linear costs of
    2244             :          * string transformation (this is a conservative estimate).  The decay
    2245             :          * rate is chosen to be a little less aggressive than halving -- which
    2246             :          * (since we're called at points at which memtupcount has doubled)
    2247             :          * would never see the cost model actually abort past the first call
    2248             :          * following a decay.  This decay rate is mostly a precaution against
    2249             :          * a sudden, violent swing in how well abbreviated cardinality tracks
    2250             :          * full key cardinality.  The decay also serves to prevent a marginal
    2251             :          * case from being aborted too late, when too much has already been
    2252             :          * invested in string transformation.
    2253             :          *
    2254             :          * It's possible for sets of several million distinct strings with
    2255             :          * mere tens of thousands of distinct abbreviated keys to still
    2256             :          * benefit very significantly.  This will generally occur provided
    2257             :          * each abbreviated key is a proxy for a roughly uniform number of the
    2258             :          * set's full keys. If it isn't so, we hope to catch that early and
    2259             :          * abort.  If it isn't caught early, by the time the problem is
    2260             :          * apparent it's probably not worth aborting.
    2261             :          */
    2262        1020 :         if (memtupcount > 10000)
    2263           4 :             sss->prop_card *= 0.65;
    2264             : 
    2265        1020 :         return false;
    2266             :     }
    2267             : 
    2268             :     /*
    2269             :      * Abort abbreviation strategy.
    2270             :      *
    2271             :      * The worst case, where all abbreviated keys are identical while all
    2272             :      * original strings differ will typically only see a regression of about
    2273             :      * 10% in execution time for small to medium sized lists of strings.
    2274             :      * Whereas on modern CPUs where cache stalls are the dominant cost, we can
    2275             :      * often expect very large improvements, particularly with sets of strings
    2276             :      * of moderately high to high abbreviated cardinality.  There is little to
    2277             :      * lose but much to gain, which our strategy reflects.
    2278             :      */
    2279           0 :     if (trace_sort)
    2280           0 :         elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
    2281             :              "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
    2282             :              memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
    2283             : 
    2284           0 :     return true;
    2285             : }
    2286             : 
    2287             : /*
    2288             :  * Generic equalimage support function for character type's operator classes.
    2289             :  * Disables the use of deduplication with nondeterministic collations.
                     :  *
                     :  * Returns the "deterministic" flag of the locale obtained from the call's
                     :  * collation via pg_newlocale_from_collation().  check_collation_set() is
                     :  * invoked on the collation first (presumably raising an error when no
                     :  * collation was supplied -- confirm against its definition).  The
                     :  * opcintype argument is deliberately left unread.
    2290             :  */
    2291             : Datum
    2292        8766 : btvarstrequalimage(PG_FUNCTION_ARGS)
    2293             : {
    2294             :     /* Oid      opcintype = PG_GETARG_OID(0); */
    2295        8766 :     Oid         collid = PG_GET_COLLATION();
    2296             :     pg_locale_t locale;
    2297             : 
    2298        8766 :     check_collation_set(collid);
    2299             : 
    2300        8766 :     locale = pg_newlocale_from_collation(collid);
    2301             : 
    2302        8766 :     PG_RETURN_BOOL(locale->deterministic);
    2303             : }
    2304             : 
    2305             : Datum
    2306      229560 : text_larger(PG_FUNCTION_ARGS)
    2307             : {
    2308      229560 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2309      229560 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2310             :     text       *result;
    2311             : 
    2312      229560 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
    2313             : 
    2314      229560 :     PG_RETURN_TEXT_P(result);
    2315             : }
    2316             : 
    2317             : Datum
    2318       86076 : text_smaller(PG_FUNCTION_ARGS)
    2319             : {
    2320       86076 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2321       86076 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2322             :     text       *result;
    2323             : 
    2324       86076 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
    2325             : 
    2326       86076 :     PG_RETURN_TEXT_P(result);
    2327             : }
    2328             : 
    2329             : 
    2330             : /*
    2331             :  * Cross-type comparison functions for types text and name.
    2332             :  */
    2333             : 
    2334             : Datum
    2335      188518 : nameeqtext(PG_FUNCTION_ARGS)
    2336             : {
    2337      188518 :     Name        arg1 = PG_GETARG_NAME(0);
    2338      188518 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2339      188518 :     size_t      len1 = strlen(NameStr(*arg1));
    2340      188518 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2341      188518 :     Oid         collid = PG_GET_COLLATION();
    2342             :     bool        result;
    2343             : 
    2344      188518 :     check_collation_set(collid);
    2345             : 
    2346      188518 :     if (collid == C_COLLATION_OID)
    2347      254288 :         result = (len1 == len2 &&
    2348      123670 :                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2349             :     else
    2350       57900 :         result = (varstr_cmp(NameStr(*arg1), len1,
    2351       57900 :                              VARDATA_ANY(arg2), len2,
    2352             :                              collid) == 0);
    2353             : 
    2354      188518 :     PG_FREE_IF_COPY(arg2, 1);
    2355             : 
    2356      188518 :     PG_RETURN_BOOL(result);
    2357             : }
    2358             : 
    2359             : Datum
    2360        7864 : texteqname(PG_FUNCTION_ARGS)
    2361             : {
    2362        7864 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2363        7864 :     Name        arg2 = PG_GETARG_NAME(1);
    2364        7864 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2365        7864 :     size_t      len2 = strlen(NameStr(*arg2));
    2366        7864 :     Oid         collid = PG_GET_COLLATION();
    2367             :     bool        result;
    2368             : 
    2369        7864 :     check_collation_set(collid);
    2370             : 
    2371        7864 :     if (collid == C_COLLATION_OID)
    2372         568 :         result = (len1 == len2 &&
    2373         182 :                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2374             :     else
    2375        7478 :         result = (varstr_cmp(VARDATA_ANY(arg1), len1,
    2376        7478 :                              NameStr(*arg2), len2,
    2377             :                              collid) == 0);
    2378             : 
    2379        7864 :     PG_FREE_IF_COPY(arg1, 0);
    2380             : 
    2381        7864 :     PG_RETURN_BOOL(result);
    2382             : }
    2383             : 
    2384             : Datum
    2385          18 : namenetext(PG_FUNCTION_ARGS)
    2386             : {
    2387          18 :     Name        arg1 = PG_GETARG_NAME(0);
    2388          18 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2389          18 :     size_t      len1 = strlen(NameStr(*arg1));
    2390          18 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2391          18 :     Oid         collid = PG_GET_COLLATION();
    2392             :     bool        result;
    2393             : 
    2394          18 :     check_collation_set(collid);
    2395             : 
    2396          18 :     if (collid == C_COLLATION_OID)
    2397           0 :         result = !(len1 == len2 &&
    2398           0 :                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2399             :     else
    2400          18 :         result = !(varstr_cmp(NameStr(*arg1), len1,
    2401          18 :                               VARDATA_ANY(arg2), len2,
    2402             :                               collid) == 0);
    2403             : 
    2404          18 :     PG_FREE_IF_COPY(arg2, 1);
    2405             : 
    2406          18 :     PG_RETURN_BOOL(result);
    2407             : }
    2408             : 
    2409             : Datum
    2410          18 : textnename(PG_FUNCTION_ARGS)
    2411             : {
    2412          18 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2413          18 :     Name        arg2 = PG_GETARG_NAME(1);
    2414          18 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2415          18 :     size_t      len2 = strlen(NameStr(*arg2));
    2416          18 :     Oid         collid = PG_GET_COLLATION();
    2417             :     bool        result;
    2418             : 
    2419          18 :     check_collation_set(collid);
    2420             : 
    2421          18 :     if (collid == C_COLLATION_OID)
    2422           0 :         result = !(len1 == len2 &&
    2423           0 :                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2424             :     else
    2425          18 :         result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
    2426          18 :                               NameStr(*arg2), len2,
    2427             :                               collid) == 0);
    2428             : 
    2429          18 :     PG_FREE_IF_COPY(arg1, 0);
    2430             : 
    2431          18 :     PG_RETURN_BOOL(result);
    2432             : }
    2433             : 
    2434             : Datum
    2435      121190 : btnametextcmp(PG_FUNCTION_ARGS)
    2436             : {
    2437      121190 :     Name        arg1 = PG_GETARG_NAME(0);
    2438      121190 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2439             :     int32       result;
    2440             : 
    2441      242380 :     result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
    2442      242380 :                         VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
    2443             :                         PG_GET_COLLATION());
    2444             : 
    2445      121190 :     PG_FREE_IF_COPY(arg2, 1);
    2446             : 
    2447      121190 :     PG_RETURN_INT32(result);
    2448             : }
    2449             : 
    2450             : Datum
    2451          44 : bttextnamecmp(PG_FUNCTION_ARGS)
    2452             : {
    2453          44 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2454          44 :     Name        arg2 = PG_GETARG_NAME(1);
    2455             :     int32       result;
    2456             : 
    2457          44 :     result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
    2458          44 :                         NameStr(*arg2), strlen(NameStr(*arg2)),
    2459             :                         PG_GET_COLLATION());
    2460             : 
    2461          44 :     PG_FREE_IF_COPY(arg1, 0);
    2462             : 
    2463          44 :     PG_RETURN_INT32(result);
    2464             : }
    2465             : 
    2466             : #define CmpCall(cmpfunc) \
    2467             :     DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
    2468             :                                           PG_GET_COLLATION(), \
    2469             :                                           PG_GETARG_DATUM(0), \
    2470             :                                           PG_GETARG_DATUM(1)))
    2471             : 
    2472             : Datum
    2473       58018 : namelttext(PG_FUNCTION_ARGS)
    2474             : {
    2475       58018 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
    2476             : }
    2477             : 
    2478             : Datum
    2479           0 : nameletext(PG_FUNCTION_ARGS)
    2480             : {
    2481           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
    2482             : }
    2483             : 
    2484             : Datum
    2485           0 : namegttext(PG_FUNCTION_ARGS)
    2486             : {
    2487           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
    2488             : }
    2489             : 
    2490             : Datum
    2491       50594 : namegetext(PG_FUNCTION_ARGS)
    2492             : {
    2493       50594 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
    2494             : }
    2495             : 
    2496             : Datum
    2497           0 : textltname(PG_FUNCTION_ARGS)
    2498             : {
    2499           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
    2500             : }
    2501             : 
    2502             : Datum
    2503           0 : textlename(PG_FUNCTION_ARGS)
    2504             : {
    2505           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
    2506             : }
    2507             : 
    2508             : Datum
    2509           0 : textgtname(PG_FUNCTION_ARGS)
    2510             : {
    2511           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
    2512             : }
    2513             : 
    2514             : Datum
    2515           0 : textgename(PG_FUNCTION_ARGS)
    2516             : {
    2517           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
    2518             : }
    2519             : 
    2520             : #undef CmpCall
    2521             : 
    2522             : 
    2523             : /*
    2524             :  * The following operators support character-by-character comparison
    2525             :  * of text datums, to allow building indexes suitable for LIKE clauses.
    2526             :  * Note that the regular texteq/textne comparison operators, and regular
    2527             :  * support functions 1 and 2 with "C" collation are assumed to be
    2528             :  * compatible with these!
    2529             :  */
    2530             : 
    2531             : static int
    2532      160444 : internal_text_pattern_compare(text *arg1, text *arg2)
    2533             : {
    2534             :     int         result;
    2535             :     int         len1,
    2536             :                 len2;
    2537             : 
    2538      160444 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2539      160444 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2540             : 
    2541      160444 :     result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    2542      160444 :     if (result != 0)
    2543      160312 :         return result;
    2544         132 :     else if (len1 < len2)
    2545           0 :         return -1;
    2546         132 :     else if (len1 > len2)
    2547          84 :         return 1;
    2548             :     else
    2549          48 :         return 0;
    2550             : }
    2551             : 
    2552             : 
    2553             : Datum
    2554       47866 : text_pattern_lt(PG_FUNCTION_ARGS)
    2555             : {
    2556       47866 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2557       47866 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2558             :     int         result;
    2559             : 
    2560       47866 :     result = internal_text_pattern_compare(arg1, arg2);
    2561             : 
    2562       47866 :     PG_FREE_IF_COPY(arg1, 0);
    2563       47866 :     PG_FREE_IF_COPY(arg2, 1);
    2564             : 
    2565       47866 :     PG_RETURN_BOOL(result < 0);
    2566             : }
    2567             : 
    2568             : 
    2569             : Datum
    2570       37510 : text_pattern_le(PG_FUNCTION_ARGS)
    2571             : {
    2572       37510 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2573       37510 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2574             :     int         result;
    2575             : 
    2576       37510 :     result = internal_text_pattern_compare(arg1, arg2);
    2577             : 
    2578       37510 :     PG_FREE_IF_COPY(arg1, 0);
    2579       37510 :     PG_FREE_IF_COPY(arg2, 1);
    2580             : 
    2581       37510 :     PG_RETURN_BOOL(result <= 0);
    2582             : }
    2583             : 
    2584             : 
    2585             : Datum
    2586       37534 : text_pattern_ge(PG_FUNCTION_ARGS)
    2587             : {
    2588       37534 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2589       37534 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2590             :     int         result;
    2591             : 
    2592       37534 :     result = internal_text_pattern_compare(arg1, arg2);
    2593             : 
    2594       37534 :     PG_FREE_IF_COPY(arg1, 0);
    2595       37534 :     PG_FREE_IF_COPY(arg2, 1);
    2596             : 
    2597       37534 :     PG_RETURN_BOOL(result >= 0);
    2598             : }
    2599             : 
    2600             : 
    2601             : Datum
    2602       37510 : text_pattern_gt(PG_FUNCTION_ARGS)
    2603             : {
    2604       37510 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2605       37510 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2606             :     int         result;
    2607             : 
    2608       37510 :     result = internal_text_pattern_compare(arg1, arg2);
    2609             : 
    2610       37510 :     PG_FREE_IF_COPY(arg1, 0);
    2611       37510 :     PG_FREE_IF_COPY(arg2, 1);
    2612             : 
    2613       37510 :     PG_RETURN_BOOL(result > 0);
    2614             : }
    2615             : 
    2616             : 
    2617             : Datum
    2618          24 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
    2619             : {
    2620          24 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2621          24 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2622             :     int         result;
    2623             : 
    2624          24 :     result = internal_text_pattern_compare(arg1, arg2);
    2625             : 
    2626          24 :     PG_FREE_IF_COPY(arg1, 0);
    2627          24 :     PG_FREE_IF_COPY(arg2, 1);
    2628             : 
    2629          24 :     PG_RETURN_INT32(result);
    2630             : }
    2631             : 
    2632             : 
    2633             : Datum
    2634         118 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
    2635             : {
    2636         118 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    2637             :     MemoryContext oldcontext;
    2638             : 
    2639         118 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    2640             : 
    2641             :     /* Use generic string SortSupport, forcing "C" collation */
    2642         118 :     varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
    2643             : 
    2644         118 :     MemoryContextSwitchTo(oldcontext);
    2645             : 
    2646         118 :     PG_RETURN_VOID();
    2647             : }
    2648             : 
    2649             : 
    2650             : /* text_name()
    2651             :  * Converts a text type to a Name type.
    2652             :  */
    2653             : Datum
    2654       30746 : text_name(PG_FUNCTION_ARGS)
    2655             : {
    2656       30746 :     text       *s = PG_GETARG_TEXT_PP(0);
    2657             :     Name        result;
    2658             :     int         len;
    2659             : 
    2660       30746 :     len = VARSIZE_ANY_EXHDR(s);
    2661             : 
    2662             :     /* Truncate oversize input */
    2663       30746 :     if (len >= NAMEDATALEN)
    2664           6 :         len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
    2665             : 
    2666             :     /* We use palloc0 here to ensure result is zero-padded */
    2667       30746 :     result = (Name) palloc0(NAMEDATALEN);
    2668       30746 :     memcpy(NameStr(*result), VARDATA_ANY(s), len);
    2669             : 
    2670       30746 :     PG_RETURN_NAME(result);
    2671             : }
    2672             : 
    2673             : /* name_text()
    2674             :  * Converts a Name type to a text type.
    2675             :  */
    2676             : Datum
    2677      655010 : name_text(PG_FUNCTION_ARGS)
    2678             : {
    2679      655010 :     Name        s = PG_GETARG_NAME(0);
    2680             : 
    2681      655010 :     PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
    2682             : }
    2683             : 
    2684             : 
    2685             : /*
    2686             :  * textToQualifiedNameList - convert a text object to list of names
    2687             :  *
    2688             :  * This implements the input parsing needed by nextval() and other
    2689             :  * functions that take a text parameter representing a qualified name.
    2690             :  * We split the name at dots, downcase if not double-quoted, and
    2691             :  * truncate names if they're too long.
    2692             :  */
    2693             : List *
    2694        5414 : textToQualifiedNameList(text *textval)
    2695             : {
    2696             :     char       *rawname;
    2697        5414 :     List       *result = NIL;
    2698             :     List       *namelist;
    2699             :     ListCell   *l;
    2700             : 
    2701             :     /* Convert to C string (handles possible detoasting). */
    2702             :     /* Note we rely on being able to modify rawname below. */
    2703        5414 :     rawname = text_to_cstring(textval);
    2704             : 
    2705        5414 :     if (!SplitIdentifierString(rawname, '.', &namelist))
    2706           0 :         ereport(ERROR,
    2707             :                 (errcode(ERRCODE_INVALID_NAME),
    2708             :                  errmsg("invalid name syntax")));
    2709             : 
    2710        5414 :     if (namelist == NIL)
    2711           0 :         ereport(ERROR,
    2712             :                 (errcode(ERRCODE_INVALID_NAME),
    2713             :                  errmsg("invalid name syntax")));
    2714             : 
    2715       10944 :     foreach(l, namelist)
    2716             :     {
    2717        5530 :         char       *curname = (char *) lfirst(l);
    2718             : 
    2719        5530 :         result = lappend(result, makeString(pstrdup(curname)));
    2720             :     }
    2721             : 
    2722        5414 :     pfree(rawname);
    2723        5414 :     list_free(namelist);
    2724             : 
    2725        5414 :     return result;
    2726             : }
    2727             : 
    2728             : /*
    2729             :  * SplitIdentifierString --- parse a string containing identifiers
    2730             :  *
    2731             :  * This is the guts of textToQualifiedNameList, and is exported for use in
    2732             :  * other situations such as parsing GUC variables.  In the GUC case, it's
    2733             :  * important to avoid memory leaks, so the API is designed to minimize the
    2734             :  * amount of stuff that needs to be allocated and freed.
    2735             :  *
    2736             :  * Inputs:
    2737             :  *  rawstring: the input string; must be overwritable!  On return, it's
    2738             :  *             been modified to contain the separated identifiers.
    2739             :  *  separator: the separator punctuation expected between identifiers
    2740             :  *             (typically '.' or ',').  Whitespace may also appear around
    2741             :  *             identifiers.
    2742             :  * Outputs:
    2743             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    2744             :  *            rawstring.  Caller should list_free() this even on error return.
    2745             :  *
    2746             :  * Returns true if okay, false if there is a syntax error in the string.
    2747             :  *
    2748             :  * Note that an empty string is considered okay here, though not in
    2749             :  * textToQualifiedNameList.
    2750             :  */
    2751             : bool
    2752      334468 : SplitIdentifierString(char *rawstring, char separator,
    2753             :                       List **namelist)
    2754             : {
    2755      334468 :     char       *nextp = rawstring;
    2756      334468 :     bool        done = false;
    2757             : 
    2758      334468 :     *namelist = NIL;
    2759             : 
    2760      334474 :     while (scanner_isspace(*nextp))
    2761           6 :         nextp++;                /* skip leading whitespace */
    2762             : 
    2763      334468 :     if (*nextp == '\0')
    2764       30216 :         return true;            /* allow empty string */
    2765             : 
    2766             :     /* At the top of the loop, we are at start of a new identifier. */
    2767             :     do
    2768             :     {
    2769             :         char       *curname;
    2770             :         char       *endp;
    2771             : 
    2772      562860 :         if (*nextp == '"')
    2773             :         {
    2774             :             /* Quoted name --- collapse quote-quote pairs, no downcasing */
    2775       39660 :             curname = nextp + 1;
    2776             :             for (;;)
    2777             :             {
    2778       39664 :                 endp = strchr(nextp + 1, '"');
    2779       39662 :                 if (endp == NULL)
    2780           0 :                     return false;   /* mismatched quotes */
    2781       39662 :                 if (endp[1] != '"')
    2782       39660 :                     break;      /* found end of quoted name */
    2783             :                 /* Collapse adjacent quotes into one quote, and look again */
    2784           2 :                 memmove(endp, endp + 1, strlen(endp));
    2785           2 :                 nextp = endp;
    2786             :             }
    2787             :             /* endp now points at the terminating quote */
    2788       39660 :             nextp = endp + 1;
    2789             :         }
    2790             :         else
    2791             :         {
    2792             :             /* Unquoted name --- extends to separator or whitespace */
    2793             :             char       *downname;
    2794             :             int         len;
    2795             : 
    2796      523200 :             curname = nextp;
    2797     4783482 :             while (*nextp && *nextp != separator &&
    2798     4260284 :                    !scanner_isspace(*nextp))
    2799     4260282 :                 nextp++;
    2800      523200 :             endp = nextp;
    2801      523200 :             if (curname == nextp)
    2802           0 :                 return false;   /* empty unquoted name not allowed */
    2803             : 
    2804             :             /*
    2805             :              * Downcase the identifier, using same code as main lexer does.
    2806             :              *
    2807             :              * XXX because we want to overwrite the input in-place, we cannot
    2808             :              * support a downcasing transformation that increases the string
    2809             :              * length.  This is not a problem given the current implementation
    2810             :              * of downcase_truncate_identifier, but we'll probably have to do
    2811             :              * something about this someday.
    2812             :              */
    2813      523200 :             len = endp - curname;
    2814      523200 :             downname = downcase_truncate_identifier(curname, len, false);
    2815             :             Assert(strlen(downname) <= len);
    2816      523200 :             strncpy(curname, downname, len);    /* strncpy is required here */
    2817      523200 :             pfree(downname);
    2818             :         }
    2819             : 
    2820      562862 :         while (scanner_isspace(*nextp))
    2821           2 :             nextp++;            /* skip trailing whitespace */
    2822             : 
    2823      562860 :         if (*nextp == separator)
    2824             :         {
    2825      258608 :             nextp++;
    2826      491810 :             while (scanner_isspace(*nextp))
    2827      233202 :                 nextp++;        /* skip leading whitespace for next */
    2828             :             /* we expect another name, so done remains false */
    2829             :         }
    2830      304252 :         else if (*nextp == '\0')
    2831      304250 :             done = true;
    2832             :         else
    2833           2 :             return false;       /* invalid syntax */
    2834             : 
    2835             :         /* Now safe to overwrite separator with a null */
    2836      562858 :         *endp = '\0';
    2837             : 
    2838             :         /* Truncate name if it's overlength */
    2839      562858 :         truncate_identifier(curname, strlen(curname), false);
    2840             : 
    2841             :         /*
    2842             :          * Finished isolating current name --- add it to list
    2843             :          */
    2844      562858 :         *namelist = lappend(*namelist, curname);
    2845             : 
    2846             :         /* Loop back if we didn't reach end of string */
    2847      562858 :     } while (!done);
    2848             : 
    2849      304250 :     return true;
    2850             : }
    2851             : 
    2852             : 
    2853             : /*
    2854             :  * SplitDirectoriesString --- parse a string containing file/directory names
    2855             :  *
    2856             :  * This works fine on file names too; the function name is historical.
    2857             :  *
    2858             :  * This is similar to SplitIdentifierString, except that the parsing
    2859             :  * rules are meant to handle pathnames instead of identifiers: there is
    2860             :  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
    2861             :  * and we apply canonicalize_path() to each extracted string.  Because of the
    2862             :  * last, the returned strings are separately palloc'd rather than being
    2863             :  * pointers into rawstring --- but we still scribble on rawstring.
    2864             :  *
    2865             :  * Inputs:
    2866             :  *  rawstring: the input string; must be modifiable!
    2867             :  *  separator: the separator punctuation expected between directories
    2868             :  *             (typically ',' or ';').  Whitespace may also appear around
    2869             :  *             directories.
    2870             :  * Outputs:
    2871             :  *  namelist: filled with a palloc'd list of directory names.
    2872             :  *            Caller should list_free_deep() this even on error return.
    2873             :  *
    2874             :  * Returns true if okay, false if there is a syntax error in the string.
    2875             :  *
    2876             :  * Note that an empty string is considered okay here.
    2877             :  */
    2878             : bool
    2879        1776 : SplitDirectoriesString(char *rawstring, char separator,
    2880             :                        List **namelist)
    2881             : {
    2882        1776 :     char       *nextp = rawstring;
    2883        1776 :     bool        done = false;
    2884             : 
    2885        1776 :     *namelist = NIL;
    2886             : 
    2887        1776 :     while (scanner_isspace(*nextp))
    2888           0 :         nextp++;                /* skip leading whitespace */
    2889             : 
    2890        1776 :     if (*nextp == '\0')
    2891           2 :         return true;            /* allow empty string */
    2892             : 
    2893             :     /* At the top of the loop, we are at start of a new directory. */
    2894             :     do
    2895             :     {
    2896             :         char       *curname;
    2897             :         char       *endp;
    2898             : 
    2899        1776 :         if (*nextp == '"')
    2900             :         {
    2901             :             /* Quoted name --- collapse quote-quote pairs */
    2902           0 :             curname = nextp + 1;
    2903             :             for (;;)
    2904             :             {
    2905           0 :                 endp = strchr(nextp + 1, '"');
    2906           0 :                 if (endp == NULL)
    2907           0 :                     return false;   /* mismatched quotes */
    2908           0 :                 if (endp[1] != '"')
    2909           0 :                     break;      /* found end of quoted name */
    2910             :                 /* Collapse adjacent quotes into one quote, and look again */
    2911           0 :                 memmove(endp, endp + 1, strlen(endp));
    2912           0 :                 nextp = endp;
    2913             :             }
    2914             :             /* endp now points at the terminating quote */
    2915           0 :             nextp = endp + 1;
    2916             :         }
    2917             :         else
    2918             :         {
    2919             :             /* Unquoted name --- extends to separator or end of string */
    2920        1776 :             curname = endp = nextp;
    2921       29664 :             while (*nextp && *nextp != separator)
    2922             :             {
    2923             :                 /* trailing whitespace should not be included in name */
    2924       27888 :                 if (!scanner_isspace(*nextp))
    2925       27888 :                     endp = nextp + 1;
    2926       27888 :                 nextp++;
    2927             :             }
    2928        1776 :             if (curname == endp)
    2929           0 :                 return false;   /* empty unquoted name not allowed */
    2930             :         }
    2931             : 
    2932        1776 :         while (scanner_isspace(*nextp))
    2933           0 :             nextp++;            /* skip trailing whitespace */
    2934             : 
    2935        1776 :         if (*nextp == separator)
    2936             :         {
    2937           2 :             nextp++;
    2938           2 :             while (scanner_isspace(*nextp))
    2939           0 :                 nextp++;        /* skip leading whitespace for next */
    2940             :             /* we expect another name, so done remains false */
    2941             :         }
    2942        1774 :         else if (*nextp == '\0')
    2943        1774 :             done = true;
    2944             :         else
    2945           0 :             return false;       /* invalid syntax */
    2946             : 
    2947             :         /* Now safe to overwrite separator with a null */
    2948        1776 :         *endp = '\0';
    2949             : 
    2950             :         /* Truncate path if it's overlength */
    2951        1776 :         if (strlen(curname) >= MAXPGPATH)
    2952           0 :             curname[MAXPGPATH - 1] = '\0';
    2953             : 
    2954             :         /*
    2955             :          * Finished isolating current name --- add it to list
    2956             :          */
    2957        1776 :         curname = pstrdup(curname);
    2958        1776 :         canonicalize_path(curname);
    2959        1776 :         *namelist = lappend(*namelist, curname);
    2960             : 
    2961             :         /* Loop back if we didn't reach end of string */
    2962        1776 :     } while (!done);
    2963             : 
    2964        1774 :     return true;
    2965             : }
    2966             : 
    2967             : 
    2968             : /*
    2969             :  * SplitGUCList --- parse a string containing identifiers or file names
    2970             :  *
    2971             :  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
    2972             :  * presuming whether the elements will be taken as identifiers or file names.
    2973             :  * We assume the input has already been through flatten_set_variable_args(),
    2974             :  * so that we need never downcase (if appropriate, that was done already).
    2975             :  * Nor do we ever truncate, since we don't know the correct max length.
    2976             :  * We disallow whitespace inside unquoted names for simplicity (it shouldn't
    2977             :  * matter, because any embedded whitespace should have led to double-quoting).
    2978             :  * Otherwise the API is identical to SplitIdentifierString.
    2979             :  *
    2980             :  * XXX it's annoying to have so many copies of this string-splitting logic.
    2981             :  * However, it's not clear that having one function with a bunch of option
    2982             :  * flags would be much better.
    2983             :  *
    2984             :  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
    2985             :  * Be sure to update that if you have to change this.
    2986             :  *
    2987             :  * Inputs:
    2988             :  *  rawstring: the input string; must be overwritable!  On return, it's
    2989             :  *             been modified to contain the separated identifiers.
    2990             :  *  separator: the separator punctuation expected between identifiers
    2991             :  *             (typically '.' or ',').  Whitespace may also appear around
    2992             :  *             identifiers.
    2993             :  * Outputs:
    2994             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    2995             :  *            rawstring.  Caller should list_free() this even on error return.
    2996             :  *
    2997             :  * Returns true if okay (blank input gives NIL), false on a syntax error.
    2998             :  */
    2999             : bool
    3000        3922 : SplitGUCList(char *rawstring, char separator,
    3001             :              List **namelist)
    3002             : {
    3003        3922 :     char       *nextp = rawstring;
    3004        3922 :     bool        done = false;
    3005             : 
    3006        3922 :     *namelist = NIL;
    3007             : 
    3008        3922 :     while (scanner_isspace(*nextp))
    3009           0 :         nextp++;                /* skip leading whitespace */
    3010             : 
    3011        3922 :     if (*nextp == '\0')
    3012        3848 :         return true;            /* allow empty string */
    3013             : 
    3014             :     /* At the top of the loop, we are at start of a new identifier. */
    3015             :     do
    3016             :     {
    3017             :         char       *curname;
    3018             :         char       *endp;
    3019             : 
    3020         100 :         if (*nextp == '"')
    3021             :         {
    3022             :             /* Quoted name --- collapse quote-quote pairs */
    3023          24 :             curname = nextp + 1;
    3024             :             for (;;)
    3025             :             {
    3026          36 :                 endp = strchr(nextp + 1, '"');
    3027          30 :                 if (endp == NULL)
    3028           0 :                     return false;   /* mismatched quotes */
    3029          30 :                 if (endp[1] != '"')
    3030          24 :                     break;      /* found end of quoted name */
    3031             :                 /* Collapse the quote pair to one (move includes NUL); look again */
    3032           6 :                 memmove(endp, endp + 1, strlen(endp));
    3033           6 :                 nextp = endp;   /* next search starts after the kept quote */
    3034             :             }
    3035             :             /* endp now points at the terminating quote */
    3036          24 :             nextp = endp + 1;
    3037             :         }
    3038             :         else
    3039             :         {
    3040             :             /* Unquoted name --- extends to separator or whitespace */
    3041          76 :             curname = nextp;
    3042         718 :             while (*nextp && *nextp != separator &&
    3043         642 :                    !scanner_isspace(*nextp))
    3044         642 :                 nextp++;
    3045          76 :             endp = nextp;
    3046          76 :             if (curname == nextp)
    3047           0 :                 return false;   /* empty unquoted name not allowed */
    3048             :         }
    3049             : 
    3050         100 :         while (scanner_isspace(*nextp))
    3051           0 :             nextp++;            /* skip trailing whitespace */
    3052             : 
    3053         100 :         if (*nextp == separator)
    3054             :         {
    3055          26 :             nextp++;
    3056          44 :             while (scanner_isspace(*nextp))
    3057          18 :                 nextp++;        /* skip leading whitespace for next */
    3058             :             /* we expect another name, so done remains false */
    3059             :         }
    3060          74 :         else if (*nextp == '\0')
    3061          74 :             done = true;
    3062             :         else
    3063           0 :             return false;       /* invalid syntax */
    3064             : 
    3065             :         /* Now safe to end the name at endp (separator, space, quote or NUL) */
    3066         100 :         *endp = '\0';
    3067             : 
    3068             :         /*
    3069             :          * Finished isolating current name --- add it to list (no copy is made)
    3070             :          */
    3071         100 :         *namelist = lappend(*namelist, curname);
    3072             : 
    3073             :         /* Loop back if we didn't reach end of string */
    3074         100 :     } while (!done);
    3075             : 
    3076          74 :     return true;
    3077             : }
    3078             : 
    3079             : /*
    3080             :  * appendStringInfoText
    3081             :  *
    3082             :  * Append the contents of text value t to str in a single binary copy.
    3083             :  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
    3084             :  */
    3085             : static void
    3086     1928506 : appendStringInfoText(StringInfo str, const text *t)
    3087             : {
    3088     1928506 :     appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
    3089     1928506 : }
    3090             : 
    3091             : /*
    3092             :  * replace_text
    3093             :  * replace all occurrences of arg 1 (from_sub_text) in arg 0 (src_text)
    3094             :  * with arg 2 (to_sub_text) to form a new text value
    3095             :  *
    3096             :  * returns src_text itself if it or from_sub_text is empty, or if
    3097             :  * from_sub_text does not occur in it; otherwise returns the new value
    3098             :  */
    3099             : Datum
    3100        1416 : replace_text(PG_FUNCTION_ARGS)
    3101             : {
    3102        1416 :     text       *src_text = PG_GETARG_TEXT_PP(0);
    3103        1416 :     text       *from_sub_text = PG_GETARG_TEXT_PP(1);
    3104        1416 :     text       *to_sub_text = PG_GETARG_TEXT_PP(2);
    3105             :     int         src_text_len;
    3106             :     int         from_sub_text_len;
    3107             :     TextPositionState state;
    3108             :     text       *ret_text;
    3109             :     int         chunk_len;
    3110             :     char       *curr_ptr;
    3111             :     char       *start_ptr;
    3112             :     StringInfoData str;
    3113             :     bool        found;
    3114             : 
    3115        1416 :     src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3116        1416 :     from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
    3117             : 
    3118             :     /* Return unmodified source string if empty source or pattern */
    3119        1416 :     if (src_text_len < 1 || from_sub_text_len < 1)
    3120             :     {
    3121           0 :         PG_RETURN_TEXT_P(src_text);
    3122             :     }
    3123             : 
    3124        1416 :     text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
    3125             : 
    3126        1416 :     found = text_position_next(&state);
    3127             : 
    3128             :     /* When the from_sub_text is not found, there is nothing to do. */
    3129        1416 :     if (!found)
    3130             :     {
    3131         320 :         text_position_cleanup(&state);
    3132         320 :         PG_RETURN_TEXT_P(src_text);
    3133             :     }
    3134        1096 :     curr_ptr = text_position_get_match_ptr(&state);
    3135        1096 :     start_ptr = VARDATA_ANY(src_text);
    3136             : 
    3137        1096 :     initStringInfo(&str);
    3138             : 
    3139             :     do
    3140             :     {
    3141        5668 :         CHECK_FOR_INTERRUPTS();
    3142             : 
    3143             :         /* copy the data skipped over by last text_position_next() */
    3144        5668 :         chunk_len = curr_ptr - start_ptr;
    3145        5668 :         appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3146             : 
    3147        5668 :         appendStringInfoText(&str, to_sub_text);
    3148             : 
    3149        5668 :         start_ptr = curr_ptr + state.last_match_len;    /* resume after the match */
    3150             : 
    3151        5668 :         found = text_position_next(&state);
    3152        5668 :         if (found)
    3153        4572 :             curr_ptr = text_position_get_match_ptr(&state);
    3154             :     }
    3155        5668 :     while (found);
    3156             : 
    3157             :     /* copy trailing data that follows the last match */
    3158        1096 :     chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    3159        1096 :     appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3160             : 
    3161        1096 :     text_position_cleanup(&state);
    3162             : 
    3163        1096 :     ret_text = cstring_to_text_with_len(str.data, str.len);
    3164        1096 :     pfree(str.data);
    3165             : 
    3166        1096 :     PG_RETURN_TEXT_P(ret_text);
    3167             : }
    3168             : 
    3169             : /*
    3170             :  * check_replace_text_has_escape
    3171             :  *
    3172             :  * Returns 0 if text contains no backslashes that need processing.
    3173             :  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
    3174             :  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
    3175             :  */
    3176             : static int
    3177       18704 : check_replace_text_has_escape(const text *replace_text)
    3178             : {
    3179       18704 :     int         result = 0;
    3180       18704 :     const char *p = VARDATA_ANY(replace_text);
    3181       18704 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3182             : 
    3183       37452 :     while (p < p_end)
    3184             :     {
    3185             :         /* Find next escape char, if any. */
    3186       17628 :         p = memchr(p, '\\', p_end - p);
    3187       17628 :         if (p == NULL)
    3188       16806 :             break;
    3189         822 :         p++;
    3190             :         /* Note: a backslash at the end doesn't require extra processing. */
    3191         822 :         if (p < p_end)
    3192             :         {
    3193         822 :             if (*p >= '1' && *p <= '9')
    3194         778 :                 return 2;       /* Found a submatch specifier, so done */
    3195          44 :             result = 1;         /* Found some other sequence, keep looking */
    3196          44 :             p++;
    3197             :         }
    3198             :     }
    3199       17926 :     return result;
    3200             : }
    3201             : 
    3202             : /*
    3203             :  * appendStringInfoRegexpSubstr
    3204             :  *
    3205             :  * Append replace_text to str, expanding \1 .. \9 (regexp back references),
    3206             :  * \& (the whole match) and \\ (one backslash).  start_ptr is the start of
    3207             :  * the match in the source string, at logical character position data_pos.
    3208             :  */
    3209             : static void
    3210         236 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
    3211             :                              regmatch_t *pmatch,
    3212             :                              char *start_ptr, int data_pos)
    3213             : {
    3214         236 :     const char *p = VARDATA_ANY(replace_text);
    3215         236 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3216             : 
    3217         574 :     while (p < p_end)
    3218             :     {
    3219         518 :         const char *chunk_start = p;
    3220             :         int         so;
    3221             :         int         eo;
    3222             : 
    3223             :         /* Find next escape char, if any. */
    3224         518 :         p = memchr(p, '\\', p_end - p);
    3225         518 :         if (p == NULL)
    3226         174 :             p = p_end;
    3227             : 
    3228             :         /* Copy the text we just scanned over, if any. */
    3229         518 :         if (p > chunk_start)
    3230         318 :             appendBinaryStringInfo(str, chunk_start, p - chunk_start);
    3231             : 
    3232             :         /* Done if at end of string, else advance over escape char. */
    3233         518 :         if (p >= p_end)
    3234         174 :             break;
    3235         344 :         p++;
    3236             : 
    3237         344 :         if (p >= p_end)
    3238             :         {
    3239             :             /* Escape at very end of input.  Treat same as unexpected char */
    3240           6 :             appendStringInfoChar(str, '\\');
    3241           6 :             break;
    3242             :         }
    3243             : 
    3244         338 :         if (*p >= '1' && *p <= '9')
    3245         278 :         {
    3246             :             /* Use the back reference of regexp. */
    3247         278 :             int         idx = *p - '0';
    3248             : 
    3249         278 :             so = pmatch[idx].rm_so;
    3250         278 :             eo = pmatch[idx].rm_eo;
    3251         278 :             p++;
    3252             :         }
    3253          60 :         else if (*p == '&')
    3254             :         {
    3255             :             /* Use the entire matched string. */
    3256          18 :             so = pmatch[0].rm_so;
    3257          18 :             eo = pmatch[0].rm_eo;
    3258          18 :             p++;
    3259             :         }
    3260          42 :         else if (*p == '\\')
    3261             :         {
    3262             :             /* \\ means transfer one \ to output. */
    3263          36 :             appendStringInfoChar(str, '\\');
    3264          36 :             p++;
    3265          36 :             continue;
    3266             :         }
    3267             :         else
    3268             :         {
    3269             :             /*
    3270             :              * If escape char is not followed by any expected char, just treat
    3271             :              * it as ordinary data to copy.  (XXX would it be better to throw
    3272             :              * an error?)
    3273             :              */
    3274           6 :             appendStringInfoChar(str, '\\');
    3275           6 :             continue;           /* p not advanced: the next char is copied as data */
    3276             :         }
    3277             : 
    3278         296 :         if (so >= 0 && eo >= 0) /* else: append nothing for this escape */
    3279             :         {
    3280             :             /*
    3281             :              * Copy the text that is back reference of regexp.  Note so and eo
    3282             :              * are counted in characters not bytes.
    3283             :              */
    3284             :             char       *chunk_start;    /* NB: shadows the outer chunk_start */
    3285             :             int         chunk_len;
    3286             : 
    3287             :             Assert(so >= data_pos);
    3288         296 :             chunk_start = start_ptr;
    3289         296 :             chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
    3290         296 :             chunk_len = charlen_to_bytelen(chunk_start, eo - so);
    3291         296 :             appendBinaryStringInfo(str, chunk_start, chunk_len);
    3292             :         }
    3293             :     }
    3294         236 : }
    3295             : 
    3296             : /*
    3297             :  * replace_text_regexp
    3298             :  *
    3299             :  * replace substring(s) in src_text that match pattern_text with replace_text.
    3300             :  * The replace_text can contain backslash markers to substitute
    3301             :  * (parts of) the matched text.
    3302             :  *
    3303             :  * cflags: regexp compile flags.
    3304             :  * collation: collation to use.
    3305             :  * search_start: the character (not byte) offset in src_text at which to
    3306             :  * begin searching.
    3307             :  * n: if 0, replace all matches; if > 0, replace only the N'th match.
    3308             :  */
    3309             : text *
    3310       18704 : replace_text_regexp(text *src_text, text *pattern_text,
    3311             :                     text *replace_text,
    3312             :                     int cflags, Oid collation,
    3313             :                     int search_start, int n)
    3314             : {
    3315             :     text       *ret_text;
    3316             :     regex_t    *re;
    3317       18704 :     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3318       18704 :     int         nmatches = 0;
    3319             :     StringInfoData buf;
    3320             :     regmatch_t  pmatch[10];     /* main match, plus \1 to \9 */
    3321       18704 :     int         nmatch = lengthof(pmatch);
    3322             :     pg_wchar   *data;
    3323             :     size_t      data_len;
    3324             :     int         data_pos;
    3325             :     char       *start_ptr;
    3326             :     int         escape_status;
    3327             : 
    3328       18704 :     initStringInfo(&buf);
    3329             : 
    3330             :     /* Convert data string to wide characters. */
    3331       18704 :     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
    3332       18704 :     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
    3333             : 
    3334             :     /* Check whether replace_text has escapes, especially regexp submatches. */
    3335       18704 :     escape_status = check_replace_text_has_escape(replace_text);
    3336             : 
    3337             :     /* If no regexp submatches, we can use REG_NOSUB. */
    3338       18704 :     if (escape_status < 2)
    3339             :     {
    3340       17926 :         cflags |= REG_NOSUB;
    3341             :         /* Also tell pg_regexec we only want the whole-match location. */
    3342       17926 :         nmatch = 1;
    3343             :     }
    3344             : 
    3345             :     /* Prepare the regexp. */
    3346       18704 :     re = RE_compile_and_cache(pattern_text, cflags, collation);
    3347             : 
    3348             :     /* start_ptr points to the data_pos'th character of src_text */
    3349       18704 :     start_ptr = (char *) VARDATA_ANY(src_text);
    3350       18704 :     data_pos = 0;
    3351             : 
    3352       25066 :     while (search_start <= data_len)    /* NOTE(review): int vs size_t compare; assumes search_start >= 0 */
    3353             :     {
    3354             :         int         regexec_result;
    3355             : 
    3356       25060 :         CHECK_FOR_INTERRUPTS();
    3357             : 
    3358       25060 :         regexec_result = pg_regexec(re,
    3359             :                                     data,
    3360             :                                     data_len,
    3361             :                                     search_start,
    3362             :                                     NULL,   /* no details */
    3363             :                                     nmatch,
    3364             :                                     pmatch,
    3365             :                                     0);
    3366             : 
    3367       25060 :         if (regexec_result == REG_NOMATCH)
    3368       16638 :             break;
    3369             : 
    3370        8422 :         if (regexec_result != REG_OKAY)
    3371             :         {
    3372             :             char        errMsg[100];
    3373             : 
    3374           0 :             pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
    3375           0 :             ereport(ERROR,
    3376             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    3377             :                      errmsg("regular expression failed: %s", errMsg)));
    3378             :         }
    3379             : 
    3380             :         /*
    3381             :          * Count matches, and decide whether to replace this match.
    3382             :          */
    3383        8422 :         nmatches++;
    3384        8422 :         if (n > 0 && nmatches != n)
    3385             :         {
    3386             :             /*
    3387             :              * No, so advance search_start, but not start_ptr/data_pos. (Thus,
    3388             :              * we treat the matched text as if it weren't matched, and copy it
    3389             :              * to the output later.)
    3390             :              */
    3391          60 :             search_start = pmatch[0].rm_eo;
    3392          60 :             if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3393           0 :                 search_start++; /* zero-length match: step one character forward */
    3394          60 :             continue;
    3395             :         }
    3396             : 
    3397             :         /*
    3398             :          * Copy the text to the left of the match position.  Note we are given
    3399             :          * character not byte indexes.
    3400             :          */
    3401        8362 :         if (pmatch[0].rm_so - data_pos > 0)
    3402             :         {
    3403             :             int         chunk_len;
    3404             : 
    3405        8188 :             chunk_len = charlen_to_bytelen(start_ptr,
    3406        8188 :                                            pmatch[0].rm_so - data_pos);
    3407        8188 :             appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3408             : 
    3409             :             /*
    3410             :              * Advance start_ptr over that text, to avoid multiple rescans of
    3411             :              * it if the replace_text contains multiple back-references.
    3412             :              */
    3413        8188 :             start_ptr += chunk_len;
    3414        8188 :             data_pos = pmatch[0].rm_so;
    3415             :         }
    3416             : 
    3417             :         /*
    3418             :          * Copy the replace_text, processing escapes if any are present.
    3419             :          */
    3420        8362 :         if (escape_status > 0)
    3421         236 :             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
    3422             :                                          start_ptr, data_pos);
    3423             :         else
    3424        8126 :             appendStringInfoText(&buf, replace_text);
    3425             : 
    3426             :         /* Advance start_ptr and data_pos over the matched text. */
    3427       16724 :         start_ptr += charlen_to_bytelen(start_ptr,
    3428        8362 :                                         pmatch[0].rm_eo - data_pos);
    3429        8362 :         data_pos = pmatch[0].rm_eo;
    3430             : 
    3431             :         /*
    3432             :          * If we only want to replace one occurrence, we're done.
    3433             :          */
    3434        8362 :         if (n > 0)
    3435        2060 :             break;
    3436             : 
    3437             :         /*
    3438             :          * Advance search position.  Normally we start the next search at the
    3439             :          * end of the previous match; but if the match was of zero length, we
    3440             :          * have to advance by one character, or we'd just find the same match
    3441             :          * again.
    3442             :          */
    3443        6302 :         search_start = data_pos;
    3444        6302 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3445          12 :             search_start++;
    3446             :     }
    3447             : 
    3448             :     /*
    3449             :      * Copy the text to the right of the last replaced match (everything, if
    3450             :      * nothing was replaced).
    3451       18704 :     if (data_pos < data_len)    /* NOTE(review): int vs size_t compare */
    3452             :     {
    3453             :         int         chunk_len;
    3454             : 
    3455       17828 :         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    3456       17828 :         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3457             :     }
    3458             : 
    3459       18704 :     ret_text = cstring_to_text_with_len(buf.data, buf.len);
    3460       18704 :     pfree(buf.data);
    3461       18704 :     pfree(data);
    3462             : 
    3463       18704 :     return ret_text;
    3464             : }
    3465             : 
    3466             : /*
    3467             :  * split_part
    3468             :  * parse input string based on provided field separator
    3469             :  * return N'th item (1 based, negative counts from end)
    3470             :  */
    3471             : Datum
    3472         150 : split_part(PG_FUNCTION_ARGS)
    3473             : {
    3474         150 :     text       *inputstring = PG_GETARG_TEXT_PP(0);
    3475         150 :     text       *fldsep = PG_GETARG_TEXT_PP(1);
    3476         150 :     int         fldnum = PG_GETARG_INT32(2);
    3477             :     int         inputstring_len;
    3478             :     int         fldsep_len;
    3479             :     TextPositionState state;
    3480             :     char       *start_ptr;
    3481             :     char       *end_ptr;
    3482             :     text       *result_text;
    3483             :     bool        found;
    3484             : 
    3485             :     /* field number is 1 based */
    3486         150 :     if (fldnum == 0)
    3487           6 :         ereport(ERROR,
    3488             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    3489             :                  errmsg("field position must not be zero")));
    3490             : 
    3491         144 :     inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3492         144 :     fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3493             : 
    3494             :     /* return empty string for empty input string */
    3495         144 :     if (inputstring_len < 1)
    3496          12 :         PG_RETURN_TEXT_P(cstring_to_text(""));
    3497             : 
    3498             :     /* handle empty field separator */
    3499         132 :     if (fldsep_len < 1)
    3500             :     {
    3501             :         /* if first or last field, return input string, else empty string */
    3502          24 :         if (fldnum == 1 || fldnum == -1)
    3503          12 :             PG_RETURN_TEXT_P(inputstring);
    3504             :         else
    3505          12 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3506             :     }
    3507             : 
    3508             :     /* find the first field separator */
    3509         108 :     text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
    3510             : 
    3511         108 :     found = text_position_next(&state);
    3512             : 
    3513             :     /* special case if fldsep not found at all */
    3514         108 :     if (!found)
    3515             :     {
    3516          24 :         text_position_cleanup(&state);
    3517             :         /* if first or last field, return input string, else empty string */
    3518          24 :         if (fldnum == 1 || fldnum == -1)
    3519          12 :             PG_RETURN_TEXT_P(inputstring);
    3520             :         else
    3521          12 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3522             :     }
    3523             : 
    3524             :     /*
    3525             :      * take care of a negative field number (i.e. count from the right) by
    3526             :      * converting to a positive field number; we need total number of fields
    3527             :      */
    3528          84 :     if (fldnum < 0)
    3529             :     {
    3530             :         /* we found a fldsep, so there are at least two fields */
    3531          42 :         int         numfields = 2;
    3532             : 
    3533          54 :         while (text_position_next(&state))
    3534          12 :             numfields++;
    3535             : 
    3536             :         /* special case of last field does not require an extra pass */
    3537          42 :         if (fldnum == -1)
    3538             :         {
    3539          24 :             start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
    3540          24 :             end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
    3541          24 :             text_position_cleanup(&state);
    3542          24 :             PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
    3543             :                                                       end_ptr - start_ptr));
    3544             :         }
    3545             : 
    3546             :         /* else, convert fldnum to positive notation */
    3547          18 :         fldnum += numfields + 1;
    3548             : 
    3549             :         /* if nonexistent field, return empty string */
    3550          18 :         if (fldnum <= 0)
    3551             :         {
    3552           6 :             text_position_cleanup(&state);
    3553           6 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3554             :         }
    3555             : 
    3556             :         /* reset to pointing at first match, but now with positive fldnum */
    3557          12 :         text_position_reset(&state);
    3558          12 :         found = text_position_next(&state);
    3559             :         Assert(found);
    3560             :     }
    3561             : 
    3562             :     /* identify bounds of first field */
    3563          54 :     start_ptr = VARDATA_ANY(inputstring);
    3564          54 :     end_ptr = text_position_get_match_ptr(&state);
    3565             : 
    3566         102 :     while (found && --fldnum > 0)
    3567             :     {
    3568             :         /* identify bounds of next field */
    3569          48 :         start_ptr = end_ptr + state.last_match_len;
    3570          48 :         found = text_position_next(&state);
    3571          48 :         if (found)
    3572          18 :             end_ptr = text_position_get_match_ptr(&state);
    3573             :     }
    3574             : 
    3575          54 :     text_position_cleanup(&state);
    3576             : 
    3577          54 :     if (fldnum > 0)
    3578             :     {
    3579             :         /* N'th field separator not found */
    3580             :         /* if last field requested, return it, else empty string */
    3581          30 :         if (fldnum == 1)
    3582             :         {
    3583          24 :             int         last_len = start_ptr - VARDATA_ANY(inputstring);
    3584             : 
    3585          24 :             result_text = cstring_to_text_with_len(start_ptr,
    3586             :                                                    inputstring_len - last_len);
    3587             :         }
    3588             :         else
    3589           6 :             result_text = cstring_to_text("");
    3590             :     }
    3591             :     else
    3592             :     {
    3593             :         /* non-last field requested */
    3594          24 :         result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
    3595             :     }
    3596             : 
    3597          54 :     PG_RETURN_TEXT_P(result_text);
    3598             : }
    3599             : 
    3600             : /*
    3601             :  * Convenience function: true iff texteq reports txt1 and txt2 equal under collation collid.
    3602             :  */
    3603             : static bool
    3604         384 : text_isequal(text *txt1, text *txt2, Oid collid)
    3605             : {
    3606         384 :     return DatumGetBool(DirectFunctionCall2Coll(texteq,
    3607             :                                                 collid,
    3608             :                                                 PointerGetDatum(txt1),
    3609             :                                                 PointerGetDatum(txt2)));
    3610             : }
    3611             : 
    3612             : /*
    3613             :  * text_to_array
    3614             :  * parse input string via split_text() and return text array of elements,
    3615             :  * based on provided field separator; a NULL input string gives a NULL result
    3616             :  */
    3617             : Datum
    3618         170 : text_to_array(PG_FUNCTION_ARGS)
    3619             : {
    3620             :     SplitTextOutputData tstate;
    3621             : 
    3622             :     /* For array output, tstate should start as all zeroes (tupstore == NULL selects array mode) */
    3623         170 :     memset(&tstate, 0, sizeof(tstate));
    3624             : 
    3625         170 :     if (!split_text(fcinfo, &tstate))   /* false means the result is to be null */
    3626           6 :         PG_RETURN_NULL();
    3627             : 
    3628         164 :     if (tstate.astate == NULL)  /* valid result, but no element was accumulated */
    3629           6 :         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
    3630             : 
    3631         158 :     PG_RETURN_DATUM(makeArrayResult(tstate.astate,
    3632             :                                     CurrentMemoryContext));
    3633             : }
    3634             : 
    3635             : /*
    3636             :  * text_to_array_null
    3637             :  * parse input string and return text array of elements,
    3638             :  * based on provided field separator and null string (matching fields become NULL)
    3639             :  *
    3640             :  * This is a separate entry point only to prevent the regression tests from
    3641             :  * complaining about different argument sets for the same internal function.
    3642             :  */
    3643             : Datum
    3644          60 : text_to_array_null(PG_FUNCTION_ARGS)
    3645             : {
    3646          60 :     return text_to_array(fcinfo);   /* split_text() fetches the optional third argument itself */
    3647             : }
    3648             : 
    3649             : /*
    3650             :  * text_to_table
    3651             :  * parse input string via split_text() and return table of elements,
    3652             :  * based on provided field separator; one row is stored per field
    3653             :  */
    3654             : Datum
    3655          84 : text_to_table(PG_FUNCTION_ARGS)
    3656             : {
    3657          84 :     ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    3658             :     SplitTextOutputData tstate;
    3659             : 
    3660          84 :     tstate.astate = NULL;       /* array build state is not used for table output */
    3661          84 :     InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
    3662          84 :     tstate.tupstore = rsi->setResult;   /* non-NULL tupstore selects table mode */
    3663          84 :     tstate.tupdesc = rsi->setDesc;
    3664             : 
    3665          84 :     (void) split_text(fcinfo, &tstate); /* result ignored: a NULL input just adds no rows */
    3666             : 
    3667          84 :     return (Datum) 0;           /* output rows, if any, are in rsi->setResult */
    3668             : }
    3669             : 
    3670             : /*
    3671             :  * text_to_table_null
    3672             :  * parse input string and return table of elements,
    3673             :  * based on provided field separator and null string (matching fields become NULL)
    3674             :  *
    3675             :  * This is a separate entry point only to prevent the regression tests from
    3676             :  * complaining about different argument sets for the same internal function.
    3677             :  */
    3678             : Datum
    3679          24 : text_to_table_null(PG_FUNCTION_ARGS)
    3680             : {
    3681          24 :     return text_to_table(fcinfo);   /* split_text() fetches the optional third argument itself */
    3682             : }
    3683             : 
    3684             : /*
    3685             :  * Common code for text_to_array, text_to_array_null, text_to_table
    3686             :  * and text_to_table_null functions: split arg 0 on arg 1, adding each field to *tstate.
    3687             :  *
    3688             :  * These are not strict so we have to test for null inputs explicitly.
    3689             :  * Returns false if result is to be null (the input string is NULL), else returns true.
    3690             :  *
    3691             :  * Note that if the result is valid but empty (zero elements), we return
    3692             :  * without changing *tstate --- caller must handle that case, too.
    3693             :  */
    3694             : static bool
    3695         254 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
    3696             : {
    3697             :     text       *inputstring;
    3698             :     text       *fldsep;
    3699             :     text       *null_string;
    3700         254 :     Oid         collation = PG_GET_COLLATION();
    3701             :     int         inputstring_len;
    3702             :     int         fldsep_len;
    3703             :     char       *start_ptr;
    3704             :     text       *result_text;
    3705             : 
    3706             :     /* when input string is NULL, then result is NULL too */
    3707         254 :     if (PG_ARGISNULL(0))
    3708          12 :         return false;
    3709             : 
    3710         242 :     inputstring = PG_GETARG_TEXT_PP(0);
    3711             : 
    3712             :     /* fldsep can be NULL */
    3713         242 :     if (!PG_ARGISNULL(1))
    3714         212 :         fldsep = PG_GETARG_TEXT_PP(1);
    3715             :     else
    3716          30 :         fldsep = NULL;
    3717             : 
    3718             :     /* null_string can be NULL or omitted */
    3719         242 :     if (PG_NARGS() > 2 && !PG_ARGISNULL(2)) /* arg 2 is absent in two-argument calls */
    3720          84 :         null_string = PG_GETARG_TEXT_PP(2);
    3721             :     else
    3722         158 :         null_string = NULL;
    3723             : 
    3724         242 :     if (fldsep != NULL)
    3725             :     {
    3726             :         /*
    3727             :          * Normal case with non-null fldsep.  Use the text_position machinery
    3728             :          * to search for occurrences of fldsep.
    3729             :          */
    3730             :         TextPositionState state;
    3731             : 
    3732         212 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3733         212 :         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3734             : 
    3735             :         /* return empty set for empty input string */
    3736         212 :         if (inputstring_len < 1)
    3737          60 :             return true;
    3738             : 
    3739             :         /* empty field separator: return input string as a one-element set */
    3740         200 :         if (fldsep_len < 1)
    3741             :         {
    3742          48 :             split_text_accum_result(tstate, inputstring,
    3743             :                                     null_string, collation);
    3744          48 :             return true;
    3745             :         }
    3746             : 
    3747         152 :         text_position_setup(inputstring, fldsep, collation, &state);
    3748             : 
    3749         152 :         start_ptr = VARDATA_ANY(inputstring);
    3750             : 
    3751             :         for (;;)
    3752         512 :         {
    3753             :             bool        found;
    3754             :             char       *end_ptr;
    3755             :             int         chunk_len;
    3756             : 
    3757         664 :             CHECK_FOR_INTERRUPTS();
    3758             : 
    3759         664 :             found = text_position_next(&state);
    3760         664 :             if (!found)
    3761             :             {
    3762             :                 /* fetch last field */
    3763         152 :                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
    3764         152 :                 end_ptr = NULL; /* not used, but some compilers complain */
    3765             :             }
    3766             :             else
    3767             :             {
    3768             :                 /* fetch non-last field */
    3769         512 :                 end_ptr = text_position_get_match_ptr(&state);
    3770         512 :                 chunk_len = end_ptr - start_ptr;    /* field ends where the separator begins */
    3771             :             }
    3772             : 
    3773             :             /* build a temp text datum to pass to split_text_accum_result */
    3774         664 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3775             : 
    3776             :             /* stash away this field */
    3777         664 :             split_text_accum_result(tstate, result_text,
    3778             :                                     null_string, collation);
    3779             : 
    3780         664 :             pfree(result_text);
    3781             : 
    3782         664 :             if (!found)         /* the last field has just been emitted */
    3783         152 :                 break;
    3784             : 
    3785         512 :             start_ptr = end_ptr + state.last_match_len; /* resume just past the matched separator */
    3786             :         }
    3787             : 
    3788         152 :         text_position_cleanup(&state);
    3789             :     }
    3790             :     else
    3791             :     {
    3792             :         /*
    3793             :          * When fldsep is NULL, each character in the input string becomes a
    3794             :          * separate element in the result set.  The separator is effectively
    3795             :          * the space between characters.
    3796             :          */
    3797          30 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3798             : 
    3799          30 :         start_ptr = VARDATA_ANY(inputstring);
    3800             : 
    3801         252 :         while (inputstring_len > 0)
    3802             :         {
    3803         222 :             int         chunk_len = pg_mblen(start_ptr);    /* length of the next character */
    3804             : 
    3805         222 :             CHECK_FOR_INTERRUPTS();
    3806             : 
    3807             :             /* build a temp text datum to pass to split_text_accum_result */
    3808         222 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3809             : 
    3810             :             /* stash away this field */
    3811         222 :             split_text_accum_result(tstate, result_text,
    3812             :                                     null_string, collation);
    3813             : 
    3814         222 :             pfree(result_text);
    3815             : 
    3816         222 :             start_ptr += chunk_len;
    3817         222 :             inputstring_len -= chunk_len;
    3818             :         }
    3819             :     }
    3820             : 
    3821         182 :     return true;
    3822             : }
    3823             : 
    3824             : /*
    3825             :  * Add text item to result set (table if tstate->tupstore is set, else array).
    3826             :  *
    3827             :  * This is also responsible for checking to see if the item matches
    3828             :  * the null_string, in which case we should emit NULL instead.
    3829             :  */
    3830             : static void
    3831         934 : split_text_accum_result(SplitTextOutputData *tstate,
    3832             :                         text *field_value,
    3833             :                         text *null_string,
    3834             :                         Oid collation)
    3835             : {
    3836         934 :     bool        is_null = false;
    3837             : 
    3838         934 :     if (null_string && text_isequal(field_value, null_string, collation))   /* no null_string: never NULL */
    3839          72 :         is_null = true;
    3840             : 
    3841         934 :     if (tstate->tupstore)       /* table output: append a one-column row */
    3842             :     {
    3843             :         Datum       values[1];
    3844             :         bool        nulls[1];
    3845             : 
    3846         228 :         values[0] = PointerGetDatum(field_value);
    3847         228 :         nulls[0] = is_null;
    3848             : 
    3849         228 :         tuplestore_putvalues(tstate->tupstore,
    3850             :                              tstate->tupdesc,
    3851             :                              values,
    3852             :                              nulls);
    3853             :     }
    3854             :     else                        /* array output: extend the array build state */
    3855             :     {
    3856         706 :         tstate->astate = accumArrayResult(tstate->astate,
    3857             :                                           PointerGetDatum(field_value),
    3858             :                                           is_null,
    3859             :                                           TEXTOID,
    3860             :                                           CurrentMemoryContext);
    3861             :     }
    3862         934 : }
    3863             : 
    3864             : /*
    3865             :  * array_to_text
    3866             :  * concatenate Cstring representation of input array elements
    3867             :  * using provided field separator; null elements are left out
    3868             :  */
    3869             : Datum
    3870       95410 : array_to_text(PG_FUNCTION_ARGS)
    3871             : {
    3872       95410 :     ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
    3873       95410 :     char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3874             : 
    3875       95410 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));  /* NULL: no null string */
    3876             : }
    3877             : 
    3878             : /*
    3879             :  * array_to_text_null
    3880             :  * concatenate Cstring representation of input array elements
    3881             :  * using provided field separator and null string (printed for null elements)
    3882             :  *
    3883             :  * This version is not strict so we have to test for null inputs explicitly.
    3884             :  */
    3885             : Datum
    3886          12 : array_to_text_null(PG_FUNCTION_ARGS)
    3887             : {
    3888             :     ArrayType  *v;
    3889             :     char       *fldsep;
    3890             :     char       *null_string;
    3891             : 
    3892             :     /* returns NULL when first or second parameter is NULL */
    3893          12 :     if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
    3894           0 :         PG_RETURN_NULL();
    3895             : 
    3896          12 :     v = PG_GETARG_ARRAYTYPE_P(0);
    3897          12 :     fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3898             : 
    3899             :     /* NULL null string is passed through as a null pointer (null elements are then ignored) */
    3900          12 :     if (!PG_ARGISNULL(2))
    3901           6 :         null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
    3902             :     else
    3903           6 :         null_string = NULL;
    3904             : 
    3905          12 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
    3906             : }
    3907             : 
    3908             : /*
    3909             :  * common code for array_to_text and array_to_text_null functions; null_string may be NULL
    3910             :  */
    3911             : static text *
    3912       95440 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
    3913             :                        const char *fldsep, const char *null_string)
    3914             : {
    3915             :     text       *result;
    3916             :     int         nitems,
    3917             :                *dims,
    3918             :                 ndims;
    3919             :     Oid         element_type;
    3920             :     int         typlen;
    3921             :     bool        typbyval;
    3922             :     char        typalign;
    3923             :     StringInfoData buf;
    3924       95440 :     bool        printed = false;    /* true once any item has been appended to buf */
    3925             :     char       *p;
    3926             :     bits8      *bitmap;
    3927             :     int         bitmask;
    3928             :     int         i;
    3929             :     ArrayMetaState *my_extra;
    3930             : 
    3931       95440 :     ndims = ARR_NDIM(v);
    3932       95440 :     dims = ARR_DIMS(v);
    3933       95440 :     nitems = ArrayGetNItems(ndims, dims);
    3934             : 
    3935             :     /* if there are no elements, return an empty string */
    3936       95440 :     if (nitems == 0)
    3937       70400 :         return cstring_to_text_with_len("", 0);
    3938             : 
    3939       25040 :     element_type = ARR_ELEMTYPE(v);
    3940       25040 :     initStringInfo(&buf);
    3941             : 
    3942             :     /*
    3943             :      * We arrange to look up info about element type, including its output
    3944             :      * conversion proc, only once per series of calls, assuming the element
    3945             :      * type doesn't change underneath us.
    3946             :      */
    3947       25040 :     my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3948       25040 :     if (my_extra == NULL)
    3949             :     {
    3950        1474 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    3951             :                                                       sizeof(ArrayMetaState));
    3952        1474 :         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3953        1474 :         my_extra->element_type = ~element_type; /* force a mismatch so the lookup below runs */
    3954             :     }
    3955             : 
    3956       25040 :     if (my_extra->element_type != element_type)
    3957             :     {
    3958             :         /*
    3959             :          * Get info about element type, including its output conversion proc
    3960             :          */
    3961        1474 :         get_type_io_data(element_type, IOFunc_output,
    3962             :                          &my_extra->typlen, &my_extra->typbyval,
    3963             :                          &my_extra->typalign, &my_extra->typdelim,
    3964             :                          &my_extra->typioparam, &my_extra->typiofunc);
    3965        1474 :         fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
    3966        1474 :                       fcinfo->flinfo->fn_mcxt);
    3967        1474 :         my_extra->element_type = element_type;  /* remember which type the cached info is for */
    3968             :     }
    3969       25040 :     typlen = my_extra->typlen;
    3970       25040 :     typbyval = my_extra->typbyval;
    3971       25040 :     typalign = my_extra->typalign;
    3972             : 
    3973       25040 :     p = ARR_DATA_PTR(v);
    3974       25040 :     bitmap = ARR_NULLBITMAP(v); /* may be NULL; every use below checks for that */
    3975       25040 :     bitmask = 1;
    3976             : 
    3977       85080 :     for (i = 0; i < nitems; i++)
    3978             :     {
    3979             :         Datum       itemvalue;
    3980             :         char       *value;
    3981             : 
    3982             :         /* Get source element, checking for NULL (a clear bitmap bit marks a null element) */
    3983       60040 :         if (bitmap && (*bitmap & bitmask) == 0)
    3984             :         {
    3985             :             /* if null_string is NULL, we just ignore null elements */
    3986          18 :             if (null_string != NULL)
    3987             :             {
    3988           6 :                 if (printed)
    3989           6 :                     appendStringInfo(&buf, "%s%s", fldsep, null_string);
    3990             :                 else
    3991           0 :                     appendStringInfoString(&buf, null_string);
    3992           6 :                 printed = true;
    3993             :             }
    3994             :         }
    3995             :         else
    3996             :         {
    3997       60022 :             itemvalue = fetch_att(p, typbyval, typlen);
    3998             : 
    3999       60022 :             value = OutputFunctionCall(&my_extra->proc, itemvalue);
    4000             : 
    4001       60022 :             if (printed)        /* separator goes before every item except the first one printed */
    4002       34982 :                 appendStringInfo(&buf, "%s%s", fldsep, value);
    4003             :             else
    4004       25040 :                 appendStringInfoString(&buf, value);
    4005       60022 :             printed = true;
    4006             : 
    4007       60022 :             p = att_addlength_pointer(p, typlen, p);    /* the data pointer moves only for non-null items */
    4008       60022 :             p = (char *) att_align_nominal(p, typalign);
    4009             :         }
    4010             : 
    4011             :         /* advance bitmap pointer if any */
    4012       60040 :         if (bitmap)
    4013             :         {
    4014         108 :             bitmask <<= 1;
    4015         108 :             if (bitmask == 0x100)   /* all 8 bits of this bitmap byte have been used */
    4016             :             {
    4017           0 :                 bitmap++;
    4018           0 :                 bitmask = 1;
    4019             :             }
    4020             :         }
    4021             :     }
    4022             : 
    4023       25040 :     result = cstring_to_text_with_len(buf.data, buf.len);
    4024       25040 :     pfree(buf.data);
    4025             : 
    4026       25040 :     return result;
    4027             : }
    4028             : 
    4029             : /*
    4030             :  * Workhorse for to_bin, to_oct, and to_hex.  Note that base must be > 1 and <=
    4031             :  * 16.  Returns the lowercase digits of value in that base as a new text value.
    4032             :  */
    4033             : static inline text *
    4034       38750 : convert_to_base(uint64 value, int base)
    4035             : {
    4036       38750 :     const char *digits = "0123456789abcdef";
    4037             : 
    4038             :     /* We size the buffer for to_bin's longest possible return value. */
    4039             :     char        buf[sizeof(uint64) * BITS_PER_BYTE];
    4040       38750 :     char       *const end = buf + sizeof(buf);
    4041       38750 :     char       *ptr = end;      /* digits are written backwards from the end of buf */
    4042             : 
    4043             :     Assert(base > 1);
    4044             :     Assert(base <= 16);
    4045             : 
    4046             :     do
    4047             :     {
    4048       75974 :         *--ptr = digits[value % base];  /* least significant digit first */
    4049       75974 :         value /= base;
    4050       75974 :     } while (ptr > buf && value);   /* do-while, so a zero value still yields "0" */
    4051             : 
    4052       38750 :     return cstring_to_text_with_len(ptr, end - ptr);    /* buf is not NUL-terminated, so pass the length */
    4053             : }
    4054             : 
    4055             : /*
    4056             :  * Convert an integer to a string containing a base-2 (binary) representation
    4057             :  * of the number.  to_bin32 takes an int32 argument, to_bin64 an int64 one.
    4058             :  */
    4059             : Datum
    4060          12 : to_bin32(PG_FUNCTION_ARGS)
    4061             : {
    4062          12 :     uint64      value = (uint32) PG_GETARG_INT32(0);    /* via uint32, so negatives are not sign-extended */
    4063             : 
    4064          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4065             : }
    4066             : Datum                           /* int64-argument variant of the base-2 conversion */
    4067          12 : to_bin64(PG_FUNCTION_ARGS)
    4068             : {
    4069          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);    /* negative inputs convert modulo 2^64 */
    4070             : 
    4071          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4072             : }
    4073             : 
    4074             : /*
    4075             :  * Convert an integer to a string containing a base-8 (oct) representation of
    4076             :  * the number.  to_oct32 takes an int32 argument, to_oct64 an int64 one.
    4077             :  */
    4078             : Datum
    4079          12 : to_oct32(PG_FUNCTION_ARGS)
    4080             : {
    4081          12 :     uint64      value = (uint32) PG_GETARG_INT32(0);    /* via uint32, so negatives are not sign-extended */
    4082             : 
    4083          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4084             : }
    4085             : Datum                           /* int64-argument variant of the base-8 conversion */
    4086          12 : to_oct64(PG_FUNCTION_ARGS)
    4087             : {
    4088          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);    /* negative inputs convert modulo 2^64 */
    4089             : 
    4090          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4091             : }
    4092             : 
    4093             : /*
    4094             :  * Convert an integer to a string containing a base-16 (hex) representation of
    4095             :  * the number.  to_hex32 takes an int32 argument, to_hex64 an int64 one.
    4096             :  */
    4097             : Datum
    4098       38690 : to_hex32(PG_FUNCTION_ARGS)
    4099             : {
    4100       38690 :     uint64      value = (uint32) PG_GETARG_INT32(0);    /* via uint32, so negatives are not sign-extended */
    4101             : 
    4102       38690 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4103             : }
    4104             : Datum                           /* int64-argument variant of the base-16 conversion */
    4105          12 : to_hex64(PG_FUNCTION_ARGS)
    4106             : {
    4107          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);    /* negative inputs convert modulo 2^64 */
    4108             : 
    4109          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4110             : }
    4111             : 
    4112             : /*
    4113             :  * Return the size of a datum, possibly compressed
    4114             :  *
    4115             :  * Works on any data type: varlena (typlen -1), cstring (typlen -2), or fixed-width
    4116             :  */
    4117             : Datum
    4118         122 : pg_column_size(PG_FUNCTION_ARGS)
    4119             : {
    4120         122 :     Datum       value = PG_GETARG_DATUM(0);
    4121             :     int32       result;
    4122             :     int         typlen;
    4123             : 
    4124             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4125         122 :     if (fcinfo->flinfo->fn_extra == NULL)
    4126             :     {
    4127             :         /* Lookup the datatype of the supplied argument */
    4128         122 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4129             : 
    4130         122 :         typlen = get_typlen(argtypeid);
    4131         122 :         if (typlen == 0)        /* should not happen */
    4132           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4133             : 
    4134         122 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4135             :                                                       sizeof(int));
    4136         122 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4137             :     }
    4138             :     else
    4139           0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);   /* reuse the typlen saved by an earlier call */
    4140             : 
    4141         122 :     if (typlen == -1)
    4142             :     {
    4143             :         /* varlena type, possibly toasted */
    4144         122 :         result = toast_datum_size(value);
    4145             :     }
    4146           0 :     else if (typlen == -2)
    4147             :     {
    4148             :         /* cstring */
    4149           0 :         result = strlen(DatumGetCString(value)) + 1;    /* + 1 counts the terminating NUL */
    4150             :     }
    4151             :     else
    4152             :     {
    4153             :         /* ordinary fixed-width type */
    4154           0 :         result = typlen;
    4155             :     }
    4156             : 
    4157         122 :     PG_RETURN_INT32(result);
    4158             : }
    4159             : 
    4160             : /*
    4161             :  * Return the name ("pglz" or "lz4") of the compression method stored in the
    4162             :  * compressed attribute, as text.  Return NULL for non varlena type or uncompressed data.
    4163             :  */
    4164             : Datum
    4165         162 : pg_column_compression(PG_FUNCTION_ARGS)
    4166             : {
    4167             :     int         typlen;         /* typlen of argument 0's type, cached in fn_extra */
    4168             :     char       *result;         /* name of the compression method */
    4169             :     ToastCompressionId cmid;    /* id read from the varlena */
    4170             : 
    4171             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4172         162 :     if (fcinfo->flinfo->fn_extra == NULL)
    4173             :     {
    4174             :         /* Lookup the datatype of the supplied argument */
    4175         108 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4176             : 
    4177         108 :         typlen = get_typlen(argtypeid);
    4178         108 :         if (typlen == 0)        /* should not happen */
    4179           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4180             : 
    4181         108 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,     /* allocated in fn_mcxt, not the current context */
    4182             :                                                       sizeof(int));
    4183         108 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4184             :     }
    4185             :     else
    4186          54 :         typlen = *((int *) fcinfo->flinfo->fn_extra);   /* later calls: reuse the cached typlen */
    4187             : 
    4188         162 :     if (typlen != -1)           /* anything but typlen -1 is not a varlena: answer NULL */
    4189           0 :         PG_RETURN_NULL();
    4190             : 
    4191             :     /* get the compression method id stored in the compressed varlena */
    4192         162 :     cmid = toast_get_compression_id((struct varlena *)
    4193         162 :                                     DatumGetPointer(PG_GETARG_DATUM(0)));
    4194         162 :     if (cmid == TOAST_INVALID_COMPRESSION_ID)   /* no valid method id: answer NULL */
    4195           6 :         PG_RETURN_NULL();
    4196             : 
    4197             :     /* convert compression method id to compression method name */
    4198         156 :     switch (cmid)
    4199             :     {
    4200          66 :         case TOAST_PGLZ_COMPRESSION_ID:
    4201          66 :             result = "pglz";
    4202          66 :             break;
    4203          90 :         case TOAST_LZ4_COMPRESSION_ID:
    4204          90 :             result = "lz4";
    4205          90 :             break;
    4206           0 :         default:                /* any other id is reported as an error */
    4207           0 :             elog(ERROR, "invalid compression method id %d", cmid);
    4208             :     }
    4209             : 
    4210         156 :     PG_RETURN_TEXT_P(cstring_to_text(result));
    4211             : }
    4212             : 
    4213             : /*
    4214             :  * Return the chunk_id (the toast pointer's va_valueid, as an OID) of the on-disk TOASTed value.  Return NULL if the value
    4215             :  * is un-TOASTed or not on-disk, or if the argument's type is not varlena.
    4216             :  */
    4217             : Datum
    4218          12 : pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
    4219             : {
    4220             :     int         typlen;         /* typlen of argument 0's type, cached in fn_extra */
    4221             :     struct varlena *attr;       /* argument 0 viewed as a varlena pointer */
    4222             :     struct varatt_external toast_pointer;   /* filled in by VARATT_EXTERNAL_GET_POINTER */
    4223             : 
    4224             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4225          12 :     if (fcinfo->flinfo->fn_extra == NULL)
    4226             :     {
    4227             :         /* Lookup the datatype of the supplied argument */
    4228          12 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4229             : 
    4230          12 :         typlen = get_typlen(argtypeid);
    4231          12 :         if (typlen == 0)        /* should not happen */
    4232           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4233             : 
    4234          12 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,     /* allocated in fn_mcxt, not the current context */
    4235             :                                                       sizeof(int));
    4236          12 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4237             :     }
    4238             :     else
    4239           0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);   /* later calls: reuse the cached typlen */
    4240             : 
    4241          12 :     if (typlen != -1)           /* anything but typlen -1 is not a varlena: answer NULL */
    4242           0 :         PG_RETURN_NULL();
    4243             : 
    4244          12 :     attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));     /* take the datum pointer exactly as passed */
    4245             : 
    4246          12 :     if (!VARATT_IS_EXTERNAL_ONDISK(attr))   /* not an on-disk external pointer: answer NULL */
    4247           6 :         PG_RETURN_NULL();
    4248             : 
    4249           6 :     VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
    4250             : 
    4251           6 :     PG_RETURN_OID(toast_pointer.va_valueid);
    4252             : }
    4253             : 
    4254             : /*
    4255             :  * string_agg - Concatenates values and returns string.
    4256             :  *
    4257             :  * Syntax: string_agg(value text, delimiter text) RETURNS text
    4258             :  *
    4259             :  * Note: Any NULL values are ignored. The first-call delimiter isn't
    4260             :  * actually used at all, and on subsequent calls the delimiter precedes
    4261             :  * the associated value.
    4262             :  */
    4263             : 
    4264             : /* subroutine to initialize state: returns a new StringInfo made in the aggregate context */
    4265             : static StringInfo
    4266        2370 : makeStringAggState(FunctionCallInfo fcinfo)
    4267             : {
    4268             :     StringInfo  state;
    4269             :     MemoryContext aggcontext;   /* set by AggCheckCallContext() */
    4270             :     MemoryContext oldcontext;   /* caller's context, restored before returning */
    4271             : 
    4272        2370 :     if (!AggCheckCallContext(fcinfo, &aggcontext))
    4273             :     {
    4274             :         /* cannot be called directly because of internal-type argument */
    4275           0 :         elog(ERROR, "string_agg_transfn called in non-aggregate context");
    4276             :     }
    4277             : 
    4278             :     /*
    4279             :      * Create state in aggregate context.  It'll stay there across subsequent
    4280             :      * calls.
    4281             :      */
    4282        2370 :     oldcontext = MemoryContextSwitchTo(aggcontext);
    4283        2370 :     state = makeStringInfo();
    4284        2370 :     MemoryContextSwitchTo(oldcontext);
    4285             : 
    4286        2370 :     return state;
    4287             : }
    4288             : 
    4289             : Datum
    4290      972404 : string_agg_transfn(PG_FUNCTION_ARGS)   /* transition function: arg 0 = state (may be NULL), arg 1 = value, arg 2 = delimiter */
    4291             : {
    4292             :     StringInfo  state;
    4293             : 
    4294      972404 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);   /* NULL until a state has been created */
    4295             : 
    4296             :     /* Append the value unless null, preceding it with the delimiter. */
    4297      972404 :     if (!PG_ARGISNULL(1))
    4298             :     {
    4299      957356 :         text       *value = PG_GETARG_TEXT_PP(1);
    4300      957356 :         bool        isfirst = false;    /* true only on the call that creates the state */
    4301             : 
    4302             :         /*
    4303             :          * You might think we can just throw away the first delimiter, however
    4304             :          * we must keep it as we may be a parallel worker doing partial
    4305             :          * aggregation building a state to send to the main process.  We need
    4306             :          * to keep the delimiter of every aggregation so that the combine
    4307             :          * function can properly join up the strings of two separately
    4308             :          * partially aggregated results.  The first delimiter is only stripped
    4309             :          * off in the final function.  To know how much to strip off the front
    4310             :          * of the string, we store the length of the first delimiter in the
    4311             :          * StringInfo's cursor field, which we don't otherwise need here.
    4312             :          */
    4313      957356 :         if (state == NULL)
    4314             :         {
    4315        2050 :             state = makeStringAggState(fcinfo);
    4316        2050 :             isfirst = true;
    4317             :         }
    4318             : 
    4319      957356 :         if (!PG_ARGISNULL(2))   /* a NULL delimiter appends nothing before the value */
    4320             :         {
    4321      957356 :             text       *delim = PG_GETARG_TEXT_PP(2);
    4322             : 
    4323      957356 :             appendStringInfoText(state, delim);
    4324      957356 :             if (isfirst)
    4325        2050 :                 state->cursor = VARSIZE_ANY_EXHDR(delim);   /* bytes for the final function to skip */
    4326             :         }
    4327             : 
    4328      957356 :         appendStringInfoText(state, value);
    4329             :     }
    4330             : 
    4331             :     /*
    4332             :      * The transition type for string_agg() is declared to be "internal",
    4333             :      * which is a pass-by-value type the same size as a pointer.
    4334             :      */
    4335      972404 :     if (state)
    4336      972318 :         PG_RETURN_POINTER(state);
    4337          86 :     PG_RETURN_NULL();           /* no state was passed in and none was created */
    4338             : }
    4339             : 
    4340             : /*
    4341             :  * string_agg_combine
    4342             :  *      Aggregate combine function for string_agg(text) and string_agg(bytea): appends state2's bytes to state1; either state may be NULL
    4343             :  */
    4344             : Datum
    4345         200 : string_agg_combine(PG_FUNCTION_ARGS)
    4346             : {
    4347             :     StringInfo  state1;         /* arg 0: state that receives the data; may be NULL */
    4348             :     StringInfo  state2;         /* arg 1: state whose data is appended; may be NULL */
    4349             :     MemoryContext agg_context;
    4350             : 
    4351         200 :     if (!AggCheckCallContext(fcinfo, &agg_context))
    4352           0 :         elog(ERROR, "aggregate function called in non-aggregate context");
    4353             : 
    4354         200 :     state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4355         200 :     state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
    4356             : 
    4357         200 :     if (state2 == NULL)
    4358             :     {
    4359             :         /*
    4360             :          * NULL state2 is easy, just return state1, which we know is already
    4361             :          * in the agg_context
    4362             :          */
    4363           0 :         if (state1 == NULL)
    4364           0 :             PG_RETURN_NULL();   /* both inputs NULL: result is NULL */
    4365           0 :         PG_RETURN_POINTER(state1);
    4366             :     }
    4367             : 
    4368         200 :     if (state1 == NULL)
    4369             :     {
    4370             :         /* We must copy state2's data into the agg_context */
    4371             :         MemoryContext old_context;
    4372             : 
    4373         120 :         old_context = MemoryContextSwitchTo(agg_context);
    4374         120 :         state1 = makeStringAggState(fcinfo);
    4375         120 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4376         120 :         state1->cursor = state2->cursor;    /* carry over state2's first-delimiter length */
    4377         120 :         MemoryContextSwitchTo(old_context);
    4378             :     }
    4379          80 :     else if (state2->len > 0)   /* nothing to append when state2 is empty */
    4380             :     {
    4381             :         /* Combine ... state1->cursor does not change in this case */
    4382          80 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4383             :     }
    4384             : 
    4385         200 :     PG_RETURN_POINTER(state1);
    4386             : }
    4387             : 
    4388             : /*
    4389             :  * string_agg_serialize
    4390             :  *      Aggregate serialize function for string_agg(text) and string_agg(bytea)
    4391             :  *
    4392             :  * This is strict, so we need not handle NULL input.  The bytea result holds the cursor sent as a 4-byte integer, then the state->len data bytes.
    4393             :  */
    4394             : Datum
    4395         200 : string_agg_serialize(PG_FUNCTION_ARGS)
    4396             : {
    4397             :     StringInfo  state;          /* arg 0: the state to serialize */
    4398             :     StringInfoData buf;         /* output buffer for the pq_send* routines */
    4399             :     bytea      *result;
    4400             : 
    4401             :     /* cannot be called directly because of internal-type argument */
    4402             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4403             : 
    4404         200 :     state = (StringInfo) PG_GETARG_POINTER(0);
    4405             : 
    4406         200 :     pq_begintypsend(&buf);
    4407             : 
    4408             :     /* cursor */
    4409         200 :     pq_sendint(&buf, state->cursor, 4);
    4410             : 
    4411             :     /* data */
    4412         200 :     pq_sendbytes(&buf, state->data, state->len);
    4413             : 
    4414         200 :     result = pq_endtypsend(&buf);
    4415             : 
    4416         200 :     PG_RETURN_BYTEA_P(result);
    4417             : }
    4418             : 
    4419             : /*
    4420             :  * string_agg_deserialize
    4421             :  *      Aggregate deserial function for string_agg(text) and string_agg(bytea)
    4422             :  *
    4423             :  * This is strict, so we need not handle NULL input.  Reads a 4-byte cursor, then treats all remaining bytes as the data.
    4424             :  */
    4425             : Datum
    4426         200 : string_agg_deserialize(PG_FUNCTION_ARGS)
    4427             : {
    4428             :     bytea      *sstate;         /* arg 0: the serialized state */
    4429             :     StringInfo  result;         /* rebuilt state, made by makeStringAggState() */
    4430             :     StringInfoData buf;         /* read-only view over sstate's payload */
    4431             :     char       *data;
    4432             :     int         datalen;        /* payload size minus the 4-byte cursor */
    4433             : 
    4434             :     /* cannot be called directly because of internal-type argument */
    4435             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4436             : 
    4437         200 :     sstate = PG_GETARG_BYTEA_PP(0);
    4438             : 
    4439             :     /*
    4440             :      * Initialize a StringInfo so that we can "receive" it using the standard
    4441             :      * recv-function infrastructure.
    4442             :      */
    4443         200 :     initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
    4444         200 :                            VARSIZE_ANY_EXHDR(sstate));
    4445             : 
    4446         200 :     result = makeStringAggState(fcinfo);
    4447             : 
    4448             :     /* cursor */
    4449         200 :     result->cursor = pq_getmsgint(&buf, 4);
    4450             : 
    4451             :     /* data */
    4452         200 :     datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
    4453         200 :     data = (char *) pq_getmsgbytes(&buf, datalen);
    4454         200 :     appendBinaryStringInfo(result, data, datalen);
    4455             : 
    4456         200 :     pq_getmsgend(&buf);
    4457             : 
    4458         200 :     PG_RETURN_POINTER(result);
    4459             : }
    4460             : 
    4461             : Datum
    4462        2094 : string_agg_finalfn(PG_FUNCTION_ARGS)   /* final function: text built from the state, or NULL if the state is NULL */
    4463             : {
    4464             :     StringInfo  state;
    4465             : 
    4466             :     /* cannot be called directly because of internal-type argument */
    4467             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4468             : 
    4469        2094 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4470             : 
    4471        2094 :     if (state != NULL)
    4472             :     {
    4473             :         /* As per comment in transfn, strip data before the cursor position */
    4474        2010 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
    4475             :                                                   state->len - state->cursor));
    4476             :     }
    4477             :     else
    4478          84 :         PG_RETURN_NULL();       /* NULL state yields a NULL result */
    4479             : }
    4480             : 
    4481             : /*
    4482             :  * Prepare cache with fmgr info for the output functions of the datatypes of
    4483             :  * the arguments of a concat-like function, beginning with argument "argidx".
    4484             :  * (Arguments before that will have corresponding slots in the resulting
    4485             :  * FmgrInfo array, but we don't fill those slots.)  The array is stored in fn_extra and also returned.
    4486             :  */
    4487             : static FmgrInfo *
    4488         106 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
    4489             : {
    4490             :     FmgrInfo   *foutcache;      /* PG_NARGS() entries, indexed by argument number */
    4491             :     int         i;
    4492             : 
    4493             :     /* We keep the info in fn_mcxt so it survives across calls */
    4494         106 :     foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4495         106 :                                                 PG_NARGS() * sizeof(FmgrInfo));
    4496             : 
    4497         400 :     for (i = argidx; i < PG_NARGS(); i++)
    4498             :     {
    4499             :         Oid         valtype;    /* declared type of argument i */
    4500             :         Oid         typOutput;  /* OID of that type's output function */
    4501             :         bool        typIsVarlena;   /* set by getTypeOutputInfo(); not used here */
    4502             : 
    4503         294 :         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
    4504         294 :         if (!OidIsValid(valtype))
    4505           0 :             elog(ERROR, "could not determine data type of concat() input");
    4506             : 
    4507         294 :         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
    4508         294 :         fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    4509             :     }
    4510             : 
    4511         106 :     fcinfo->flinfo->fn_extra = foutcache;   /* concat_internal() looks for the cache here */
    4512             : 
    4513         106 :     return foutcache;
    4514             : }
    4515             : 
    4516             : /*
    4517             :  * Implementation of both concat() and concat_ws().
    4518             :  *
    4519             :  * sepstr is the separator string to place between values.
    4520             :  * argidx identifies the first argument to concatenate (counting from zero);
    4521             :  * note that this must be constant across any one series of calls.
    4522             :  *
    4523             :  * Returns NULL if result should be NULL, else text value.  NULL arguments are skipped in the non-VARIADIC case.
    4524             :  */
    4525             : static text *
    4526         264 : concat_internal(const char *sepstr, int argidx,
    4527             :                 FunctionCallInfo fcinfo)
    4528             : {
    4529             :     text       *result;
    4530             :     StringInfoData str;         /* accumulates the concatenated output */
    4531             :     FmgrInfo   *foutcache;      /* per-argument output function info, cached in fn_extra */
    4532         264 :     bool        first_arg = true;   /* no separator before the first non-null value */
    4533             :     int         i;
    4534             : 
    4535             :     /*
    4536             :      * concat(VARIADIC some-array) is essentially equivalent to
    4537             :      * array_to_text(), ie concat the array elements with the given separator.
    4538             :      * So we just pass the case off to that code.
    4539             :      */
    4540         264 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4541             :     {
    4542             :         ArrayType  *arr;
    4543             : 
    4544             :         /* Should have just the one argument */
    4545             :         Assert(argidx == PG_NARGS() - 1);
    4546             : 
    4547             :         /* concat(VARIADIC NULL) is defined as NULL */
    4548          30 :         if (PG_ARGISNULL(argidx))
    4549          12 :             return NULL;
    4550             : 
    4551             :         /*
    4552             :          * Non-null argument had better be an array.  We assume that any call
    4553             :          * context that could let get_fn_expr_variadic return true will have
    4554             :          * checked that a VARIADIC-labeled parameter actually is an array.  So
    4555             :          * it should be okay to just Assert that it's an array rather than
    4556             :          * doing a full-fledged error check.
    4557             :          */
    4558             :         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
    4559             : 
    4560             :         /* OK, safe to fetch the array value */
    4561          18 :         arr = PG_GETARG_ARRAYTYPE_P(argidx);
    4562             : 
    4563             :         /*
    4564             :          * And serialize the array.  We tell array_to_text to ignore null
    4565             :          * elements, which matches the behavior of the loop below.
    4566             :          */
    4567          18 :         return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    4568             :     }
    4569             : 
    4570             :     /* Normal case without explicit VARIADIC marker */
    4571         234 :     initStringInfo(&str);
    4572             : 
    4573             :     /* Get output function info, building it if first time through */
    4574         234 :     foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    4575         234 :     if (foutcache == NULL)
    4576         106 :         foutcache = build_concat_foutcache(fcinfo, argidx);
    4577             : 
    4578         822 :     for (i = argidx; i < PG_NARGS(); i++)
    4579             :     {
    4580         588 :         if (!PG_ARGISNULL(i))   /* NULL arguments contribute nothing, not even a separator */
    4581             :         {
    4582         510 :             Datum       value = PG_GETARG_DATUM(i);
    4583             : 
    4584             :             /* add separator if appropriate */
    4585         510 :             if (first_arg)
    4586         228 :                 first_arg = false;
    4587             :             else
    4588         282 :                 appendStringInfoString(&str, sepstr);
    4589             : 
    4590             :             /* call the appropriate type output function, append the result */
    4591         510 :             appendStringInfoString(&str,
    4592         510 :                                    OutputFunctionCall(&foutcache[i], value));
    4593             :         }
    4594             :     }
    4595             : 
    4596         234 :     result = cstring_to_text_with_len(str.data, str.len);
    4597         234 :     pfree(str.data);            /* the scratch buffer is released once the text result is built */
    4598             : 
    4599         234 :     return result;
    4600             : }
    4601             : 
    4602             : /*
    4603             :  * Concatenate all arguments. NULL arguments are ignored.  Calls concat_internal() with an empty separator, starting at argument 0.
    4604             :  */
    4605             : Datum
    4606         186 : text_concat(PG_FUNCTION_ARGS)
    4607             : {
    4608             :     text       *result;
    4609             : 
    4610         186 :     result = concat_internal("", 0, fcinfo);
    4611         186 :     if (result == NULL)         /* concat_internal() returns NULL when the result should be NULL */
    4612           6 :         PG_RETURN_NULL();
    4613         180 :     PG_RETURN_TEXT_P(result);
    4614             : }
    4615             : 
    4616             : /*
    4617             :  * Concatenate all but first argument value with separators. The first
    4618             :  * parameter is used as the separator. NULL arguments are ignored, but a NULL separator makes the result NULL.
    4619             :  */
    4620             : Datum
    4621          84 : text_concat_ws(PG_FUNCTION_ARGS)
    4622             : {
    4623             :     char       *sep;            /* separator converted to a C string */
    4624             :     text       *result;
    4625             : 
    4626             :     /* return NULL when separator is NULL */
    4627          84 :     if (PG_ARGISNULL(0))
    4628           6 :         PG_RETURN_NULL();
    4629          78 :     sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
    4630             : 
    4631          78 :     result = concat_internal(sep, 1, fcinfo);   /* values start at argument 1 */
    4632          78 :     if (result == NULL)         /* concat_internal() returns NULL when the result should be NULL */
    4633           6 :         PG_RETURN_NULL();
    4634          72 :     PG_RETURN_TEXT_P(result);
    4635             : }
    4636             : 
    4637             : /*
    4638             :  * Return first n characters in the string. When n is negative,
    4639             :  * return all but last |n| characters.  (Argument 0 is the string, argument 1 is n.)
    4640             :  */
    4641             : Datum
    4642        2148 : text_left(PG_FUNCTION_ARGS)
    4643             : {
    4644        2148 :     int         n = PG_GETARG_INT32(1);
    4645             : 
    4646        2148 :     if (n < 0)
    4647             :     {
    4648          30 :         text       *str = PG_GETARG_TEXT_PP(0);
    4649          30 :         const char *p = VARDATA_ANY(str);
    4650          30 :         int         len = VARSIZE_ANY_EXHDR(str);   /* data length in bytes */
    4651             :         int         rlen;       /* byte length of the prefix to return */
    4652             : 
    4653          30 :         n = pg_mbstrlen_with_len(p, len) + n;   /* character count minus |n| = characters to keep */
    4654          30 :         rlen = pg_mbcharcliplen(p, len, n);
    4655          30 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    4656             :     }
    4657             :     else
    4658        2118 :         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));   /* non-negative n: delegate to text_substring() */
    4659             : }
    4660             : 
    4661             : /*
    4662             :  * Return last n characters in the string. When n is negative,
    4663             :  * return all but first |n| characters.  (Argument 0 is the string, argument 1 is n.)
    4664             :  */
    4665             : Datum
    4666          66 : text_right(PG_FUNCTION_ARGS)
    4667             : {
    4668          66 :     text       *str = PG_GETARG_TEXT_PP(0);
    4669          66 :     const char *p = VARDATA_ANY(str);
    4670          66 :     int         len = VARSIZE_ANY_EXHDR(str);   /* data length in bytes */
    4671          66 :     int         n = PG_GETARG_INT32(1);
    4672             :     int         off;            /* byte offset at which the returned suffix starts */
    4673             : 
    4674          66 :     if (n < 0)
    4675          30 :         n = -n;                 /* skip |n| leading characters; NOTE(review): -n overflows when n == INT_MIN -- confirm handling */
    4676             :     else
    4677          36 :         n = pg_mbstrlen_with_len(p, len) - n;   /* leading characters to skip so that n remain */
    4678          66 :     off = pg_mbcharcliplen(p, len, n);
    4679             : 
    4680          66 :     PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
    4681             : }
    4682             : 
    4683             : /*
    4684             :  * Return reversed string: character order is reversed, while the bytes inside each multibyte character keep their order
    4685             :  */
    4686             : Datum
    4687           6 : text_reverse(PG_FUNCTION_ARGS)
    4688             : {
    4689           6 :     text       *str = PG_GETARG_TEXT_PP(0);
    4690           6 :     const char *p = VARDATA_ANY(str);
    4691           6 :     int         len = VARSIZE_ANY_EXHDR(str);   /* data length in bytes */
    4692           6 :     const char *endp = p + len; /* one past the last input byte */
    4693             :     text       *result;
    4694             :     char       *dst;            /* write position; moves backward through the result */
    4695             : 
    4696           6 :     result = palloc(len + VARHDRSZ);
    4697           6 :     dst = (char *) VARDATA(result) + len;   /* start just past the end of the result's data area */
    4698           6 :     SET_VARSIZE(result, len + VARHDRSZ);
    4699             : 
    4700           6 :     if (pg_database_encoding_max_length() > 1)
    4701             :     {
    4702             :         /* multibyte version */
    4703          36 :         while (p < endp)
    4704             :         {
    4705             :             int         sz;     /* byte length of the character at p */
    4706             : 
    4707          30 :             sz = pg_mblen(p);
    4708          30 :             dst -= sz;
    4709          30 :             memcpy(dst, p, sz); /* copy the whole character in its original byte order */
    4710          30 :             p += sz;
    4711             :         }
    4712             :     }
    4713             :     else
    4714             :     {
    4715             :         /* single byte version */
    4716           0 :         while (p < endp)
    4717           0 :             *(--dst) = *p++;
    4718             :     }
    4719             : 
    4720           6 :     PG_RETURN_TEXT_P(result);
    4721             : }
    4722             : 
    4723             : 
    4724             : /*
    4725             :  * Support macros for text_format().  ADVANCE_PARSE_POINTER pre-increments ptr and raises an error if it has then reached end_ptr.
    4726             :  */
    4727             : #define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */
    4728             : 
    4729             : #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    4730             :     do { \
    4731             :         if (++(ptr) >= (end_ptr)) \
    4732             :             ereport(ERROR, \
    4733             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    4734             :                      errmsg("unterminated format() type specifier"), \
    4735             :                      errhint("For a single \"%%\" use \"%%%%\"."))); \
    4736             :     } while (0)
    4737             : 
    4738             : /*
    4739             :  * Returns a formatted string
    4740             :  */
    4741             : Datum
    4742       33180 : text_format(PG_FUNCTION_ARGS)
    4743             : {
    4744             :     text       *fmt;
    4745             :     StringInfoData str;
    4746             :     const char *cp;
    4747             :     const char *start_ptr;
    4748             :     const char *end_ptr;
    4749             :     text       *result;
    4750             :     int         arg;
    4751             :     bool        funcvariadic;
    4752             :     int         nargs;
    4753       33180 :     Datum      *elements = NULL;
    4754       33180 :     bool       *nulls = NULL;
    4755       33180 :     Oid         element_type = InvalidOid;
    4756       33180 :     Oid         prev_type = InvalidOid;
    4757       33180 :     Oid         prev_width_type = InvalidOid;
    4758             :     FmgrInfo    typoutputfinfo;
    4759             :     FmgrInfo    typoutputinfo_width;
    4760             : 
    4761             :     /* When format string is null, immediately return null */
    4762       33180 :     if (PG_ARGISNULL(0))
    4763           6 :         PG_RETURN_NULL();
    4764             : 
    4765             :     /* If argument is marked VARIADIC, expand array into elements */
    4766       33174 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4767             :     {
    4768             :         ArrayType  *arr;
    4769             :         int16       elmlen;
    4770             :         bool        elmbyval;
    4771             :         char        elmalign;
    4772             :         int         nitems;
    4773             : 
    4774             :         /* Should have just the one argument */
    4775             :         Assert(PG_NARGS() == 2);
    4776             : 
    4777             :         /* If argument is NULL, we treat it as zero-length array */
    4778          48 :         if (PG_ARGISNULL(1))
    4779           6 :             nitems = 0;
    4780             :         else
    4781             :         {
    4782             :             /*
    4783             :              * Non-null argument had better be an array.  We assume that any
    4784             :              * call context that could let get_fn_expr_variadic return true
    4785             :              * will have checked that a VARIADIC-labeled parameter actually is
    4786             :              * an array.  So it should be okay to just Assert that it's an
    4787             :              * array rather than doing a full-fledged error check.
    4788             :              */
    4789             :             Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
    4790             : 
    4791             :             /* OK, safe to fetch the array value */
    4792          42 :             arr = PG_GETARG_ARRAYTYPE_P(1);
    4793             : 
    4794             :             /* Get info about array element type */
    4795          42 :             element_type = ARR_ELEMTYPE(arr);
    4796          42 :             get_typlenbyvalalign(element_type,
    4797             :                                  &elmlen, &elmbyval, &elmalign);
    4798             : 
    4799             :             /* Extract all array elements */
    4800          42 :             deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
    4801             :                               &elements, &nulls, &nitems);
    4802             :         }
    4803             : 
    4804          48 :         nargs = nitems + 1;
    4805          48 :         funcvariadic = true;
    4806             :     }
    4807             :     else
    4808             :     {
    4809             :         /* Non-variadic case, we'll process the arguments individually */
    4810       33126 :         nargs = PG_NARGS();
    4811       33126 :         funcvariadic = false;
    4812             :     }
    4813             : 
    4814             :     /* Setup for main loop. */
    4815       33174 :     fmt = PG_GETARG_TEXT_PP(0);
    4816       33174 :     start_ptr = VARDATA_ANY(fmt);
    4817       33174 :     end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    4818       33174 :     initStringInfo(&str);
    4819       33174 :     arg = 1;                    /* next argument position to print */
    4820             : 
    4821             :     /* Scan format string, looking for conversion specifiers. */
    4822     1012568 :     for (cp = start_ptr; cp < end_ptr; cp++)
    4823             :     {
    4824             :         int         argpos;
    4825             :         int         widthpos;
    4826             :         int         flags;
    4827             :         int         width;
    4828             :         Datum       value;
    4829             :         bool        isNull;
    4830             :         Oid         typid;
    4831             : 
    4832             :         /*
    4833             :          * If it's not the start of a conversion specifier, just copy it to
    4834             :          * the output buffer.
    4835             :          */
    4836      979454 :         if (*cp != '%')
    4837             :         {
    4838      913622 :             appendStringInfoCharMacro(&str, *cp);
    4839      913640 :             continue;
    4840             :         }
    4841             : 
    4842       65832 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    4843             : 
    4844             :         /* Easy case: %% outputs a single % */
    4845       65832 :         if (*cp == '%')
    4846             :         {
    4847          18 :             appendStringInfoCharMacro(&str, *cp);
    4848          18 :             continue;
    4849             :         }
    4850             : 
    4851             :         /* Parse the optional portions of the format specifier */
    4852       65814 :         cp = text_format_parse_format(cp, end_ptr,
    4853             :                                       &argpos, &widthpos,
    4854             :                                       &flags, &width);
    4855             : 
    4856             :         /*
    4857             :          * Next we should see the main conversion specifier.  Whether or not
    4858             :          * an argument position was present, it's known that at least one
    4859             :          * character remains in the string at this point.  Experience suggests
    4860             :          * that it's worth checking that that character is one of the expected
    4861             :          * ones before we try to fetch arguments, so as to produce the least
    4862             :          * confusing response to a mis-formatted specifier.
    4863             :          */
    4864       65790 :         if (strchr("sIL", *cp) == NULL)
    4865           6 :             ereport(ERROR,
    4866             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4867             :                      errmsg("unrecognized format() type specifier \"%.*s\"",
    4868             :                             pg_mblen(cp), cp),
    4869             :                      errhint("For a single \"%%\" use \"%%%%\".")));
    4870             : 
    4871             :         /* If indirect width was specified, get its value */
    4872       65784 :         if (widthpos >= 0)
    4873             :         {
    4874             :             /* Collect the specified or next argument position */
    4875          42 :             if (widthpos > 0)
    4876          36 :                 arg = widthpos;
    4877          42 :             if (arg >= nargs)
    4878           0 :                 ereport(ERROR,
    4879             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4880             :                          errmsg("too few arguments for format()")));
    4881             : 
    4882             :             /* Get the value and type of the selected argument */
    4883          42 :             if (!funcvariadic)
    4884             :             {
    4885          42 :                 value = PG_GETARG_DATUM(arg);
    4886          42 :                 isNull = PG_ARGISNULL(arg);
    4887          42 :                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4888             :             }
    4889             :             else
    4890             :             {
    4891           0 :                 value = elements[arg - 1];
    4892           0 :                 isNull = nulls[arg - 1];
    4893           0 :                 typid = element_type;
    4894             :             }
    4895          42 :             if (!OidIsValid(typid))
    4896           0 :                 elog(ERROR, "could not determine data type of format() input");
    4897             : 
    4898          42 :             arg++;
    4899             : 
    4900             :             /* We can treat NULL width the same as zero */
    4901          42 :             if (isNull)
    4902           6 :                 width = 0;
    4903          36 :             else if (typid == INT4OID)
    4904          36 :                 width = DatumGetInt32(value);
    4905           0 :             else if (typid == INT2OID)
    4906           0 :                 width = DatumGetInt16(value);
    4907             :             else
    4908             :             {
    4909             :                 /* For less-usual datatypes, convert to text then to int */
    4910             :                 char       *str;
    4911             : 
    4912           0 :                 if (typid != prev_width_type)
    4913             :                 {
    4914             :                     Oid         typoutputfunc;
    4915             :                     bool        typIsVarlena;
    4916             : 
    4917           0 :                     getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    4918           0 :                     fmgr_info(typoutputfunc, &typoutputinfo_width);
    4919           0 :                     prev_width_type = typid;
    4920             :                 }
    4921             : 
    4922           0 :                 str = OutputFunctionCall(&typoutputinfo_width, value);
    4923             : 
    4924             :                 /* pg_strtoint32 will complain about bad data or overflow */
    4925           0 :                 width = pg_strtoint32(str);
    4926             : 
    4927           0 :                 pfree(str);
    4928             :             }
    4929             :         }
    4930             : 
    4931             :         /* Collect the specified or next argument position */
    4932       65784 :         if (argpos > 0)
    4933         132 :             arg = argpos;
    4934       65784 :         if (arg >= nargs)
    4935          24 :             ereport(ERROR,
    4936             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4937             :                      errmsg("too few arguments for format()")));
    4938             : 
    4939             :         /* Get the value and type of the selected argument */
    4940       65760 :         if (!funcvariadic)
    4941             :         {
    4942       64488 :             value = PG_GETARG_DATUM(arg);
    4943       64488 :             isNull = PG_ARGISNULL(arg);
    4944       64488 :             typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4945             :         }
    4946             :         else
    4947             :         {
    4948        1272 :             value = elements[arg - 1];
    4949        1272 :             isNull = nulls[arg - 1];
    4950        1272 :             typid = element_type;
    4951             :         }
    4952       65760 :         if (!OidIsValid(typid))
    4953           0 :             elog(ERROR, "could not determine data type of format() input");
    4954             : 
    4955       65760 :         arg++;
    4956             : 
    4957             :         /*
    4958             :          * Get the appropriate typOutput function, reusing previous one if
    4959             :          * same type as previous argument.  That's particularly useful in the
    4960             :          * variadic-array case, but often saves work even for ordinary calls.
    4961             :          */
    4962       65760 :         if (typid != prev_type)
    4963             :         {
    4964             :             Oid         typoutputfunc;
    4965             :             bool        typIsVarlena;
    4966             : 
    4967       34242 :             getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    4968       34242 :             fmgr_info(typoutputfunc, &typoutputfinfo);
    4969       34242 :             prev_type = typid;
    4970             :         }
    4971             : 
    4972             :         /*
    4973             :          * And now we can format the value.
    4974             :          */
    4975       65760 :         switch (*cp)
    4976             :         {
    4977       65760 :             case 's':
    4978             :             case 'I':
    4979             :             case 'L':
    4980       65760 :                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
    4981             :                                               value, isNull,
    4982             :                                               flags, width);
    4983       65754 :                 break;
    4984           0 :             default:
    4985             :                 /* should not get here, because of previous check */
    4986           0 :                 ereport(ERROR,
    4987             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4988             :                          errmsg("unrecognized format() type specifier \"%.*s\"",
    4989             :                                 pg_mblen(cp), cp),
    4990             :                          errhint("For a single \"%%\" use \"%%%%\".")));
    4991             :                 break;
    4992             :         }
    4993             :     }
    4994             : 
    4995             :     /* Don't need deconstruct_array results anymore. */
    4996       33114 :     if (elements != NULL)
    4997          42 :         pfree(elements);
    4998       33114 :     if (nulls != NULL)
    4999          42 :         pfree(nulls);
    5000             : 
    5001             :     /* Generate results. */
    5002       33114 :     result = cstring_to_text_with_len(str.data, str.len);
    5003       33114 :     pfree(str.data);
    5004             : 
    5005       33114 :     PG_RETURN_TEXT_P(result);
    5006             : }
    5007             : 
    5008             : /*
    5009             :  * Parse contiguous digits as a decimal number.
    5010             :  *
    5011             :  * Returns true if some digits could be parsed.
    5012             :  * The value is returned into *value, and *ptr is advanced to the next
    5013             :  * character to be parsed.
    5014             :  *
    5015             :  * Note parsing invariant: at least one character is known available before
    5016             :  * string end (end_ptr) at entry, and this is still true at exit.
    5017             :  */
    5018             : static bool
    5019      131592 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
    5020             : {
    5021      131592 :     bool        found = false;
    5022      131592 :     const char *cp = *ptr;
    5023      131592 :     int         val = 0;
    5024             : 
    5025      131904 :     while (*cp >= '0' && *cp <= '9')
    5026             :     {
    5027         318 :         int8        digit = (*cp - '0');
    5028             : 
    5029         318 :         if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
    5030         318 :             unlikely(pg_add_s32_overflow(val, digit, &val)))
    5031           0 :             ereport(ERROR,
    5032             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5033             :                      errmsg("number is out of range")));
    5034         318 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5035         312 :         found = true;
    5036             :     }
    5037             : 
    5038      131586 :     *ptr = cp;
    5039      131586 :     *value = val;
    5040             : 
    5041      131586 :     return found;
    5042             : }
    5043             : 
    5044             : /*
    5045             :  * Parse a format specifier (generally following the SUS printf spec).
    5046             :  *
    5047             :  * We have already advanced over the initial '%', and we are looking for
    5048             :  * [argpos][flags][width]type (but the type character is not consumed here).
    5049             :  *
    5050             :  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
    5051             :  * Output parameters:
    5052             :  *  argpos: argument position for value to be printed.  -1 means unspecified.
    5053             :  *  widthpos: argument position for width.  Zero means the argument position
    5054             :  *          was unspecified (ie, take the next arg) and -1 means no width
    5055             :  *          argument (width was omitted or specified as a constant).
    5056             :  *  flags: bitmask of flags.
    5057             :  *  width: directly-specified width value.  Zero means the width was omitted
    5058             :  *          (note it's not necessary to distinguish this case from an explicit
    5059             :  *          zero width value).
    5060             :  *
    5061             :  * The function result is the next character position to be parsed, ie, the
    5062             :  * location where the type character is/should be.
    5063             :  *
    5064             :  * Note parsing invariant: at least one character is known available before
    5065             :  * string end (end_ptr) at entry, and this is still true at exit.
    5066             :  */
    5067             : static const char *
    5068       65814 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
    5069             :                          int *argpos, int *widthpos,
    5070             :                          int *flags, int *width)
    5071             : {
    5072       65814 :     const char *cp = start_ptr;
    5073             :     int         n;
    5074             : 
    5075             :     /* set defaults for output parameters */
    5076       65814 :     *argpos = -1;
    5077       65814 :     *widthpos = -1;
    5078       65814 :     *flags = 0;
    5079       65814 :     *width = 0;
    5080             : 
    5081             :     /* try to identify first number */
    5082       65814 :     if (text_format_parse_digits(&cp, end_ptr, &n))
    5083             :     {
    5084         174 :         if (*cp != '$')
    5085             :         {
    5086             :             /* Must be just a width and a type, so we're done */
    5087          24 :             *width = n;
    5088          24 :             return cp;
    5089             :         }
    5090             :         /* The number was argument position */
    5091         150 :         *argpos = n;
    5092             :         /* Explicit 0 for argument index is immediately refused */
    5093         150 :         if (n == 0)
    5094           6 :             ereport(ERROR,
    5095             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5096             :                      errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5097         144 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5098             :     }
    5099             : 
    5100             :     /* Handle flags (only minus is supported now) */
    5101       65808 :     while (*cp == '-')
    5102             :     {
    5103          30 :         *flags |= TEXT_FORMAT_FLAG_MINUS;
    5104          30 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5105             :     }
    5106             : 
    5107       65778 :     if (*cp == '*')
    5108             :     {
    5109             :         /* Handle indirect width */
    5110          48 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5111          48 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5112             :         {
    5113             :             /* number in this position must be closed by $ */
    5114          42 :             if (*cp != '$')
    5115           0 :                 ereport(ERROR,
    5116             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5117             :                          errmsg("width argument position must be ended by \"$\"")));
    5118             :             /* The number was width argument position */
    5119          42 :             *widthpos = n;
    5120             :             /* Explicit 0 for argument index is immediately refused */
    5121          42 :             if (n == 0)
    5122           6 :                 ereport(ERROR,
    5123             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5124             :                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5125          36 :             ADVANCE_PARSE_POINTER(cp, end_ptr);
    5126             :         }
    5127             :         else
    5128           6 :             *widthpos = 0;      /* width's argument position is unspecified */
    5129             :     }
    5130             :     else
    5131             :     {
    5132             :         /* Check for direct width specification */
    5133       65730 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5134          30 :             *width = n;
    5135             :     }
    5136             : 
    5137             :     /* cp should now be pointing at type character */
    5138       65766 :     return cp;
    5139             : }
    5140             : 
    5141             : /*
    5142             :  * Format a %s, %I, or %L conversion
    5143             :  */
    5144             : static void
    5145       65760 : text_format_string_conversion(StringInfo buf, char conversion,
    5146             :                               FmgrInfo *typOutputInfo,
    5147             :                               Datum value, bool isNull,
    5148             :                               int flags, int width)
    5149             : {
    5150             :     char       *str;
    5151             : 
    5152             :     /* Handle NULL arguments before trying to stringify the value. */
    5153       65760 :     if (isNull)
    5154             :     {
    5155         342 :         if (conversion == 's')
    5156         270 :             text_format_append_string(buf, "", flags, width);
    5157          72 :         else if (conversion == 'L')
    5158          66 :             text_format_append_string(buf, "NULL", flags, width);
    5159           6 :         else if (conversion == 'I')
    5160           6 :             ereport(ERROR,
    5161             :                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
    5162             :                      errmsg("null values cannot be formatted as an SQL identifier")));
    5163         336 :         return;
    5164             :     }
    5165             : 
    5166             :     /* Stringify. */
    5167       65418 :     str = OutputFunctionCall(typOutputInfo, value);
    5168             : 
    5169             :     /* Escape. */
    5170       65418 :     if (conversion == 'I')
    5171             :     {
    5172             :         /* quote_identifier may or may not allocate a new string. */
    5173        4902 :         text_format_append_string(buf, quote_identifier(str), flags, width);
    5174             :     }
    5175       60516 :     else if (conversion == 'L')
    5176             :     {
    5177        3232 :         char       *qstr = quote_literal_cstr(str);
    5178             : 
    5179        3232 :         text_format_append_string(buf, qstr, flags, width);
    5180             :         /* quote_literal_cstr() always allocates a new string */
    5181        3232 :         pfree(qstr);
    5182             :     }
    5183             :     else
    5184       57284 :         text_format_append_string(buf, str, flags, width);
    5185             : 
    5186             :     /* Cleanup. */
    5187       65418 :     pfree(str);
    5188             : }
    5189             : 
    5190             : /*
    5191             :  * Append str to buf, padding as directed by flags/width
    5192             :  */
    5193             : static void
    5194       65754 : text_format_append_string(StringInfo buf, const char *str,
    5195             :                           int flags, int width)
    5196             : {
    5197       65754 :     bool        align_to_left = false;
    5198             :     int         len;
    5199             : 
    5200             :     /* fast path for typical easy case */
    5201       65754 :     if (width == 0)
    5202             :     {
    5203       65670 :         appendStringInfoString(buf, str);
    5204       65670 :         return;
    5205             :     }
    5206             : 
    5207          84 :     if (width < 0)
    5208             :     {
    5209             :         /* Negative width: implicit '-' flag, then take absolute value */
    5210           6 :         align_to_left = true;
    5211             :         /* -INT_MIN is undefined */
    5212           6 :         if (width <= INT_MIN)
    5213           0 :             ereport(ERROR,
    5214             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5215             :                      errmsg("number is out of range")));
    5216           6 :         width = -width;
    5217             :     }
    5218          78 :     else if (flags & TEXT_FORMAT_FLAG_MINUS)
    5219          24 :         align_to_left = true;
    5220             : 
    5221          84 :     len = pg_mbstrlen(str);
    5222          84 :     if (align_to_left)
    5223             :     {
    5224             :         /* left justify */
    5225          30 :         appendStringInfoString(buf, str);
    5226          30 :         if (len < width)
    5227          30 :             appendStringInfoSpaces(buf, width - len);
    5228             :     }
    5229             :     else
    5230             :     {
    5231             :         /* right justify */
    5232          54 :         if (len < width)
    5233          54 :             appendStringInfoSpaces(buf, width - len);
    5234          54 :         appendStringInfoString(buf, str);
    5235             :     }
    5236             : }
    5237             : 
    5238             : /*
    5239             :  * text_format_nv - nonvariadic wrapper for text_format function.
    5240             :  *
    5241             :  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
    5242             :  * which checks that all built-in functions that share the implementing C
    5243             :  * function take the same number of arguments.
    5244             :  */
    5245             : Datum
    5246        3810 : text_format_nv(PG_FUNCTION_ARGS)
    5247             : {
    5248        3810 :     return text_format(fcinfo);
    5249             : }
    5250             : 
    5251             : /*
    5252             :  * Helper function for Levenshtein distance functions. Faster than memcmp(),
    5253             :  * for this use case.
    5254             :  */
    5255             : static inline bool
    5256           0 : rest_of_char_same(const char *s1, const char *s2, int len)
    5257             : {
    5258           0 :     while (len > 0)
    5259             :     {
    5260           0 :         len--;
    5261           0 :         if (s1[len] != s2[len])
    5262           0 :             return false;
    5263             :     }
    5264           0 :     return true;
    5265             : }
    5266             : 
    5267             : /* Expand each Levenshtein distance variant */
    5268             : #include "levenshtein.c"
    5269             : #define LEVENSHTEIN_LESS_EQUAL
    5270             : #include "levenshtein.c"
    5271             : 
    5272             : 
    5273             : /*
    5274             :  * The following *ClosestMatch() functions can be used to determine whether a
    5275             :  * user-provided string resembles any known valid values, which is useful for
    5276             :  * providing hints in log messages, among other things.  Use these functions
    5277             :  * like so:
    5278             :  *
    5279             :  *      initClosestMatch(&state, source_string, max_distance);
    5280             :  *
    5281             :  *      for (int i = 0; i < num_valid_strings; i++)
    5282             :  *          updateClosestMatch(&state, valid_strings[i]);
    5283             :  *
    5284             :  *      closestMatch = getClosestMatch(&state);
    5285             :  */
    5286             : 
    5287             : /*
    5288             :  * Initialize the given state with the source string and maximum Levenshtein
    5289             :  * distance to consider.
    5290             :  */
    5291             : void
    5292          78 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
    5293             : {
    5294             :     Assert(state);
    5295             :     Assert(max_d >= 0);
    5296             : 
    5297          78 :     state->source = source;
    5298          78 :     state->min_d = -1;
    5299          78 :     state->max_d = max_d;
    5300          78 :     state->match = NULL;
    5301          78 : }
    5302             : 
    5303             : /*
    5304             :  * If the candidate string is a closer match than the current one saved (or
    5305             :  * there is no match saved), save it as the closest match.
    5306             :  *
    5307             :  * If the source or candidate string is NULL, empty, or too long, this function
    5308             :  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
    5309             :  * allowed or more than half the characters are different, no action is taken.
    5310             :  */
    5311             : void
    5312         794 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
    5313             : {
    5314             :     int         dist;
    5315             : 
    5316             :     Assert(state);
    5317             : 
    5318         794 :     if (state->source == NULL || state->source[0] == '\0' ||
    5319         794 :         candidate == NULL || candidate[0] == '\0')
    5320           0 :         return;
    5321             : 
    5322             :     /*
    5323             :      * To avoid ERROR-ing, we check the lengths here instead of setting
    5324             :      * 'trusted' to false in the call to varstr_levenshtein_less_equal().
    5325             :      */
    5326         794 :     if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
    5327         794 :         strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
    5328           0 :         return;
    5329             : 
    5330         794 :     dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
    5331         794 :                                          candidate, strlen(candidate), 1, 1, 1,
    5332             :                                          state->max_d, true);
    5333         794 :     if (dist <= state->max_d &&
    5334          62 :         dist <= strlen(state->source) / 2 &&
    5335          14 :         (state->min_d == -1 || dist < state->min_d))
    5336             :     {
    5337          14 :         state->min_d = dist;
    5338          14 :         state->match = candidate;
    5339             :     }
    5340             : }
    5341             : 
    5342             : /*
    5343             :  * Return the closest match.  If no suitable candidates were provided via
    5344             :  * updateClosestMatch(), return NULL.
    5345             :  */
    5346             : const char *
    5347          78 : getClosestMatch(ClosestMatchState *state)
    5348             : {
    5349             :     Assert(state);
    5350             : 
    5351          78 :     return state->match;
    5352             : }
    5353             : 
    5354             : 
    5355             : /*
    5356             :  * Unicode support
    5357             :  */
    5358             : 
    5359             : static UnicodeNormalizationForm
    5360         210 : unicode_norm_form_from_string(const char *formstr)
    5361             : {
    5362         210 :     UnicodeNormalizationForm form = -1;
    5363             : 
    5364             :     /*
    5365             :      * Might as well check this while we're here.
    5366             :      */
    5367         210 :     if (GetDatabaseEncoding() != PG_UTF8)
    5368           0 :         ereport(ERROR,
    5369             :                 (errcode(ERRCODE_SYNTAX_ERROR),
    5370             :                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
    5371             : 
    5372         210 :     if (pg_strcasecmp(formstr, "NFC") == 0)
    5373          66 :         form = UNICODE_NFC;
    5374         144 :     else if (pg_strcasecmp(formstr, "NFD") == 0)
    5375          60 :         form = UNICODE_NFD;
    5376          84 :     else if (pg_strcasecmp(formstr, "NFKC") == 0)
    5377          36 :         form = UNICODE_NFKC;
    5378          48 :     else if (pg_strcasecmp(formstr, "NFKD") == 0)
    5379          36 :         form = UNICODE_NFKD;
    5380             :     else
    5381          12 :         ereport(ERROR,
    5382             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5383             :                  errmsg("invalid normalization form: %s", formstr)));
    5384             : 
    5385         198 :     return form;
    5386             : }
    5387             : 
    5388             : /*
    5389             :  * Returns version of Unicode used by Postgres in "major.minor" format (the
    5390             :  * same format as the Unicode version reported by ICU). The third component
    5391             :  * ("update version") never involves additions to the character repertoire and
    5392             :  * is unimportant for most purposes.
    5393             :  *
    5394             :  * See: https://unicode.org/versions/
    5395             :  */
    5396             : Datum
    5397          30 : unicode_version(PG_FUNCTION_ARGS)
    5398             : {
    5399          30 :     PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
    5400             : }
    5401             : 
    5402             : /*
    5403             :  * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
    5404             :  */
    5405             : Datum
    5406           2 : icu_unicode_version(PG_FUNCTION_ARGS)
    5407             : {
    5408             : #ifdef USE_ICU
    5409           2 :     PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
    5410             : #else
    5411             :     PG_RETURN_NULL();
    5412             : #endif
    5413             : }
    5414             : 
    5415             : /*
    5416             :  * Check whether the string contains only assigned Unicode code
    5417             :  * points. Requires that the database encoding is UTF-8.
    5418             :  */
    5419             : Datum
    5420          12 : unicode_assigned(PG_FUNCTION_ARGS)
    5421             : {
    5422          12 :     text       *input = PG_GETARG_TEXT_PP(0);
    5423             :     unsigned char *p;
    5424             :     int         size;
    5425             : 
    5426          12 :     if (GetDatabaseEncoding() != PG_UTF8)
    5427           0 :         ereport(ERROR,
    5428             :                 (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
    5429             : 
    5430             :     /* convert to pg_wchar */
    5431          12 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5432          12 :     p = (unsigned char *) VARDATA_ANY(input);
    5433          48 :     for (int i = 0; i < size; i++)
    5434             :     {
    5435          42 :         pg_wchar    uchar = utf8_to_unicode(p);
    5436          42 :         int         category = unicode_category(uchar);
    5437             : 
    5438          42 :         if (category == PG_U_UNASSIGNED)
    5439           6 :             PG_RETURN_BOOL(false);
    5440             : 
    5441          36 :         p += pg_utf_mblen(p);
    5442             :     }
    5443             : 
    5444           6 :     PG_RETURN_BOOL(true);
    5445             : }
    5446             : 
    5447             : Datum
    5448          72 : unicode_normalize_func(PG_FUNCTION_ARGS)
    5449             : {
    5450          72 :     text       *input = PG_GETARG_TEXT_PP(0);
    5451          72 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5452             :     UnicodeNormalizationForm form;
    5453             :     int         size;
    5454             :     pg_wchar   *input_chars;
    5455             :     pg_wchar   *output_chars;
    5456             :     unsigned char *p;
    5457             :     text       *result;
    5458             :     int         i;
    5459             : 
    5460          72 :     form = unicode_norm_form_from_string(formstr);
    5461             : 
    5462             :     /* convert to pg_wchar */
    5463          66 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5464          66 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    5465          66 :     p = (unsigned char *) VARDATA_ANY(input);
    5466         288 :     for (i = 0; i < size; i++)
    5467             :     {
    5468         222 :         input_chars[i] = utf8_to_unicode(p);
    5469         222 :         p += pg_utf_mblen(p);
    5470             :     }
    5471          66 :     input_chars[i] = (pg_wchar) '\0';
    5472             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5473             : 
    5474             :     /* action */
    5475          66 :     output_chars = unicode_normalize(form, input_chars);
    5476             : 
    5477             :     /* convert back to UTF-8 string */
    5478          66 :     size = 0;
    5479         306 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    5480             :     {
    5481             :         unsigned char buf[4];
    5482             : 
    5483         240 :         unicode_to_utf8(*wp, buf);
    5484         240 :         size += pg_utf_mblen(buf);
    5485             :     }
    5486             : 
    5487          66 :     result = palloc(size + VARHDRSZ);
    5488          66 :     SET_VARSIZE(result, size + VARHDRSZ);
    5489             : 
    5490          66 :     p = (unsigned char *) VARDATA_ANY(result);
    5491         306 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    5492             :     {
    5493         240 :         unicode_to_utf8(*wp, p);
    5494         240 :         p += pg_utf_mblen(p);
    5495             :     }
    5496             :     Assert((char *) p == (char *) result + size + VARHDRSZ);
    5497             : 
    5498          66 :     PG_RETURN_TEXT_P(result);
    5499             : }
    5500             : 
    5501             : /*
    5502             :  * Check whether the string is in the specified Unicode normalization form.
    5503             :  *
    5504             :  * This is done by converting the string to the specified normal form and then
    5505             :  * comparing that to the original string.  To speed that up, we also apply the
    5506             :  * "quick check" algorithm specified in UAX #15, which can give a yes or no
    5507             :  * answer for many strings by just scanning the string once.
    5508             :  *
    5509             :  * This function should generally be optimized for the case where the string
    5510             :  * is in fact normalized.  In that case, we'll end up looking at the entire
    5511             :  * string, so it's probably not worth doing any incremental conversion etc.
    5512             :  */
    5513             : Datum
    5514         138 : unicode_is_normalized(PG_FUNCTION_ARGS)
    5515             : {
    5516         138 :     text       *input = PG_GETARG_TEXT_PP(0);
    5517         138 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5518             :     UnicodeNormalizationForm form;
    5519             :     int         size;
    5520             :     pg_wchar   *input_chars;
    5521             :     pg_wchar   *output_chars;
    5522             :     unsigned char *p;
    5523             :     int         i;
    5524             :     UnicodeNormalizationQC quickcheck;
    5525             :     int         output_size;
    5526             :     bool        result;
    5527             : 
    5528         138 :     form = unicode_norm_form_from_string(formstr);
    5529             : 
    5530             :     /* convert to pg_wchar */
    5531         132 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5532         132 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    5533         132 :     p = (unsigned char *) VARDATA_ANY(input);
    5534         504 :     for (i = 0; i < size; i++)
    5535             :     {
    5536         372 :         input_chars[i] = utf8_to_unicode(p);
    5537         372 :         p += pg_utf_mblen(p);
    5538             :     }
    5539         132 :     input_chars[i] = (pg_wchar) '\0';
    5540             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5541             : 
    5542             :     /* quick check (see UAX #15) */
    5543         132 :     quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
    5544         132 :     if (quickcheck == UNICODE_NORM_QC_YES)
    5545          42 :         PG_RETURN_BOOL(true);
    5546          90 :     else if (quickcheck == UNICODE_NORM_QC_NO)
    5547          12 :         PG_RETURN_BOOL(false);
    5548             : 
    5549             :     /* normalize and compare with original */
    5550          78 :     output_chars = unicode_normalize(form, input_chars);
    5551             : 
    5552          78 :     output_size = 0;
    5553         324 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    5554         246 :         output_size++;
    5555             : 
    5556         114 :     result = (size == output_size) &&
    5557          36 :         (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
    5558             : 
    5559          78 :     PG_RETURN_BOOL(result);
    5560             : }
    5561             : 
    5562             : /*
    5563             :  * Check if first n chars are hexadecimal digits
    5564             :  */
    5565             : static bool
    5566         156 : isxdigits_n(const char *instr, size_t n)
    5567             : {
    5568         660 :     for (size_t i = 0; i < n; i++)
    5569         570 :         if (!isxdigit((unsigned char) instr[i]))
    5570          66 :             return false;
    5571             : 
    5572          90 :     return true;
    5573             : }
    5574             : 
    5575             : static unsigned int
    5576         504 : hexval(unsigned char c)
    5577             : {
    5578         504 :     if (c >= '0' && c <= '9')
    5579         384 :         return c - '0';
    5580         120 :     if (c >= 'a' && c <= 'f')
    5581          60 :         return c - 'a' + 0xA;
    5582          60 :     if (c >= 'A' && c <= 'F')
    5583          60 :         return c - 'A' + 0xA;
    5584           0 :     elog(ERROR, "invalid hexadecimal digit");
    5585             :     return 0;                   /* not reached */
    5586             : }
    5587             : 
    5588             : /*
    5589             :  * Translate string with hexadecimal digits to number
    5590             :  */
    5591             : static unsigned int
    5592          90 : hexval_n(const char *instr, size_t n)
    5593             : {
    5594          90 :     unsigned int result = 0;
    5595             : 
    5596         594 :     for (size_t i = 0; i < n; i++)
    5597         504 :         result += hexval(instr[i]) << (4 * (n - i - 1));
    5598             : 
    5599          90 :     return result;
    5600             : }
    5601             : 
    5602             : /*
    5603             :  * Replaces Unicode escape sequences by Unicode characters
    5604             :  */
    5605             : Datum
    5606          66 : unistr(PG_FUNCTION_ARGS)
    5607             : {
    5608          66 :     text       *input_text = PG_GETARG_TEXT_PP(0);
    5609             :     char       *instr;
    5610             :     int         len;
    5611             :     StringInfoData str;
    5612             :     text       *result;
    5613          66 :     pg_wchar    pair_first = 0;
    5614             :     char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
    5615             : 
    5616          66 :     instr = VARDATA_ANY(input_text);
    5617          66 :     len = VARSIZE_ANY_EXHDR(input_text);
    5618             : 
    5619          66 :     initStringInfo(&str);
    5620             : 
    5621         510 :     while (len > 0)
    5622             :     {
    5623         486 :         if (instr[0] == '\\')
    5624             :         {
    5625         102 :             if (len >= 2 &&
    5626         102 :                 instr[1] == '\\')
    5627             :             {
    5628           6 :                 if (pair_first)
    5629           0 :                     goto invalid_pair;
    5630           6 :                 appendStringInfoChar(&str, '\\');
    5631           6 :                 instr += 2;
    5632           6 :                 len -= 2;
    5633             :             }
    5634          96 :             else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
    5635          66 :                      (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
    5636          30 :             {
    5637             :                 pg_wchar    unicode;
    5638          42 :                 int         offset = instr[1] == 'u' ? 2 : 1;
    5639             : 
    5640          42 :                 unicode = hexval_n(instr + offset, 4);
    5641             : 
    5642          42 :                 if (!is_valid_unicode_codepoint(unicode))
    5643           0 :                     ereport(ERROR,
    5644             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5645             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5646             : 
    5647          42 :                 if (pair_first)
    5648             :                 {
    5649          12 :                     if (is_utf16_surrogate_second(unicode))
    5650             :                     {
    5651           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5652           0 :                         pair_first = 0;
    5653             :                     }
    5654             :                     else
    5655          12 :                         goto invalid_pair;
    5656             :                 }
    5657          30 :                 else if (is_utf16_surrogate_second(unicode))
    5658           0 :                     goto invalid_pair;
    5659             : 
    5660          30 :                 if (is_utf16_surrogate_first(unicode))
    5661          18 :                     pair_first = unicode;
    5662             :                 else
    5663             :                 {
    5664          12 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5665          12 :                     appendStringInfoString(&str, cbuf);
    5666             :                 }
    5667             : 
    5668          30 :                 instr += 4 + offset;
    5669          30 :                 len -= 4 + offset;
    5670             :             }
    5671          54 :             else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
    5672          12 :             {
    5673             :                 pg_wchar    unicode;
    5674             : 
    5675          24 :                 unicode = hexval_n(instr + 2, 6);
    5676             : 
    5677          24 :                 if (!is_valid_unicode_codepoint(unicode))
    5678           6 :                     ereport(ERROR,
    5679             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5680             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5681             : 
    5682          18 :                 if (pair_first)
    5683             :                 {
    5684           6 :                     if (is_utf16_surrogate_second(unicode))
    5685             :                     {
    5686           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5687           0 :                         pair_first = 0;
    5688             :                     }
    5689             :                     else
    5690           6 :                         goto invalid_pair;
    5691             :                 }
    5692          12 :                 else if (is_utf16_surrogate_second(unicode))
    5693           0 :                     goto invalid_pair;
    5694             : 
    5695          12 :                 if (is_utf16_surrogate_first(unicode))
    5696           6 :                     pair_first = unicode;
    5697             :                 else
    5698             :                 {
    5699           6 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5700           6 :                     appendStringInfoString(&str, cbuf);
    5701             :                 }
    5702             : 
    5703          12 :                 instr += 8;
    5704          12 :                 len -= 8;
    5705             :             }
    5706          30 :             else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
    5707          12 :             {
    5708             :                 pg_wchar    unicode;
    5709             : 
    5710          24 :                 unicode = hexval_n(instr + 2, 8);
    5711             : 
    5712          24 :                 if (!is_valid_unicode_codepoint(unicode))
    5713           6 :                     ereport(ERROR,
    5714             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5715             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5716             : 
    5717          18 :                 if (pair_first)
    5718             :                 {
    5719           6 :                     if (is_utf16_surrogate_second(unicode))
    5720             :                     {
    5721           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5722           0 :                         pair_first = 0;
    5723             :                     }
    5724             :                     else
    5725           6 :                         goto invalid_pair;
    5726             :                 }
    5727          12 :                 else if (is_utf16_surrogate_second(unicode))
    5728           0 :                     goto invalid_pair;
    5729             : 
    5730          12 :                 if (is_utf16_surrogate_first(unicode))
    5731           6 :                     pair_first = unicode;
    5732             :                 else
    5733             :                 {
    5734           6 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5735           6 :                     appendStringInfoString(&str, cbuf);
    5736             :                 }
    5737             : 
    5738          12 :                 instr += 10;
    5739          12 :                 len -= 10;
    5740             :             }
    5741             :             else
    5742           6 :                 ereport(ERROR,
    5743             :                         (errcode(ERRCODE_SYNTAX_ERROR),
    5744             :                          errmsg("invalid Unicode escape"),
    5745             :                          errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
    5746             :         }
    5747             :         else
    5748             :         {
    5749         384 :             if (pair_first)
    5750           0 :                 goto invalid_pair;
    5751             : 
    5752         384 :             appendStringInfoChar(&str, *instr++);
    5753         384 :             len--;
    5754             :         }
    5755             :     }
    5756             : 
    5757             :     /* unfinished surrogate pair? */
    5758          24 :     if (pair_first)
    5759           6 :         goto invalid_pair;
    5760             : 
    5761          18 :     result = cstring_to_text_with_len(str.data, str.len);
    5762          18 :     pfree(str.data);
    5763             : 
    5764          18 :     PG_RETURN_TEXT_P(result);
    5765             : 
    5766          30 : invalid_pair:
    5767          30 :     ereport(ERROR,
    5768             :             (errcode(ERRCODE_SYNTAX_ERROR),
    5769             :              errmsg("invalid Unicode surrogate pair")));
    5770             :     PG_RETURN_NULL();           /* keep compiler quiet */
    5771             : }

Generated by: LCOV version 1.16