Improve to_date/to_number/to_timestamp behavior with multibyte characters.

author Tom Lane <tgl@sss.pgh.pa.us>

Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c

index cb0dbf748e501f0ba002efe37d48e9e9c4df12a8..ec97de0ad27ee9c29731f9138c965fb07c1f103a 100644 (file)
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -151,8 +151,6 @@ typedef enum
     FROM_CHAR_DATE_ISOWEEK      /* ISO 8601 week date */
  } FromCharDateMode;
  
-typedef struct FormatNode FormatNode;
-
  typedef struct
  {
     const char *name;
@@ -162,13 +160,13 @@ typedef struct
     FromCharDateMode date_mode;
  } KeyWord;
  
-struct FormatNode
+typedef struct
  {
-   int         type;           /* node type            */
-   const KeyWord *key;         /* if node type is KEYWORD  */
-   char        character;      /* if node type is CHAR     */
-   int         suffix;         /* keyword suffix       */
-};
+   int         type;           /* NODE_TYPE_XXX, see below */
+   const KeyWord *key;         /* if type is ACTION */
+   char        character[MAX_MULTIBYTE_CHAR_LEN + 1];  /* if type is CHAR */
+   int         suffix;         /* keyword prefix/suffix code, if any */
+} FormatNode;
  
  #define NODE_TYPE_END      1
  #define NODE_TYPE_ACTION   2
@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
         }
         else if (*str)
         {
+           int         chlen;
+
             /*
              * Process double-quoted literal string, if any
              */
             if (*str == '"')
             {
-               while (*(++str))
+               str++;
+               while (*str)
                 {
                     if (*str == '"')
                     {
@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                     /* backslash quotes the next character, if any */
                     if (*str == '\\' && *(str + 1))
                         str++;
+                   chlen = pg_mblen(str);
                     n->type = NODE_TYPE_CHAR;
-                   n->character = *str;
+                   memcpy(n->character, str, chlen);
+                   n->character[chlen] = '\0';
                     n->key = NULL;
                     n->suffix = 0;
                     n++;
+                   str += chlen;
                 }
             }
             else
@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                  */
                 if (*str == '\\' && *(str + 1) == '"')
                     str++;
+               chlen = pg_mblen(str);
                 n->type = NODE_TYPE_CHAR;
-               n->character = *str;
+               memcpy(n->character, str, chlen);
+               n->character[chlen] = '\0';
                 n->key = NULL;
                 n->suffix = 0;
                 n++;
-               str++;
+               str += chlen;
             }
         }
     }
@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
             elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
                  a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
         else if (n->type == NODE_TYPE_CHAR)
-           elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
+           elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
+                a, n->character);
         else if (n->type == NODE_TYPE_END)
         {
             elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
     do { \
         if (S_THth(_suf)) \
         { \
-           if (*(ptr)) (ptr)++; \
-           if (*(ptr)) (ptr)++; \
+           if (*(ptr)) (ptr) += pg_mblen(ptr); \
+           if (*(ptr)) (ptr) += pg_mblen(ptr); \
         } \
     } while (0)
  
@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
  
         return true;
     }
-   else if (isdigit((unsigned char) n->character))
+   else if (n->character[1] == '\0' &&
+            isdigit((unsigned char) n->character[0]))
         return false;
  
     return true;                /* some non-digit input (separator) */
@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
     {
         if (n->type != NODE_TYPE_ACTION)
         {
-           *s = n->character;
-           s++;
+           strcpy(s, n->character);
+           s += strlen(s);
             continue;
         }
  
@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
              * we don't insist that the consumed character match the format's
              * character.
              */
-           s++;
+           s += pg_mblen(s);
             continue;
         }
  
@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
  /*
   * These macros are used in NUM_processor() and its subsidiary routines.
   * OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s characters remain in string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
   */
  #define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
  #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
         if (!Np->is_to_char)
         {
             /*
-            * Check at least one character remains to be scanned.  (In
-            * actions below, must use AMOUNT_TEST if we want to read more
-            * characters than that.)
+            * Check at least one byte remains to be scanned.  (In actions
+            * below, must use AMOUNT_TEST if we want to read more bytes than
+            * that.)
              */
             if (OVERLOAD_TEST)
                 break;
@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
              * In TO_CHAR, non-pattern characters in the format are copied to
              * the output.  In TO_NUMBER, we skip one input character for each
              * non-pattern format character, whether or not it matches the
-            * format character.  (Currently, that's actually implemented as
-            * skipping one input byte per non-pattern format byte, which is
-            * wrong...)
+            * format character.
              */
             if (Np->is_to_char)
-               *Np->inout_p = n->character;
+           {
+               strcpy(Np->inout_p, n->character);
+               Np->inout_p += strlen(Np->inout_p);
+           }
+           else
+           {
+               Np->inout_p += pg_mblen(Np->inout_p);
+           }
+           continue;
         }
         Np->inout_p++;
     }
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)