Handle carriage returns and line feeds in COPY CSV mode.
authorBruce Momjian <bruce@momjian.us>
Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
committerBruce Momjian <bruce@momjian.us>
Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
Andrew Dunstan

src/backend/commands/copy.c

index 9f99bdd9e482868a29f5d92a572b8c40990689e4..ed815098aba7577ce2b29286bf108f8e40bc3a12 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -98,7 +98,6 @@ static bool fe_eof;               /* true if detected end of copy data */
 static EolType eol_type;       /* EOL type of input */
 static int client_encoding;    /* remote side's character encoding */
 static int server_encoding;    /* local encoding */
-static bool embedded_line_warning;
 
 /* these are just for error messages, see copy_in_error_callback */
 static bool copy_binary;       /* is it a binary copy? */
@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
 static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
  char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
         List *force_notnull_atts);
-static bool CopyReadLine(void);
+static bool CopyReadLine(char * quote, char * escape);
 static char *CopyReadAttribute(const char *delim, const char *null_print,
                  CopyReadResult *result, bool *isnull);
 static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
    attr = tupDesc->attrs;
    num_phys_attrs = tupDesc->natts;
    attr_count = list_length(attnumlist);
-   embedded_line_warning = false;
 
    /*
     * Get info about the columns we need to process.
@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
            ListCell   *cur;
 
            /* Actually read the line into memory here */
-           done = CopyReadLine();
+           done = csv_mode ? 
+               CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
 
            /*
             * EOF at start of line means we're done.  If we see EOF after
@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
  * by newline.
  */
 static bool
-CopyReadLine(void)
+CopyReadLine(char * quote, char * escape)
 {
    bool        result;
    bool        change_encoding = (client_encoding != server_encoding);
@@ -2015,6 +2014,19 @@ CopyReadLine(void)
    int         j;
    unsigned char s[2];
    char       *cvt;
+   bool        in_quote = false, last_was_esc = false, csv_mode = false;
+   char        quotec = '\0', escapec = '\0';
+
+   if (quote)
+   {
+       csv_mode = true;
+       quotec = quote[0];
+       escapec = escape[0];
+       /* ignore special escape processing if it's the same as quotec */
+       if (quotec == escapec)
+           escapec = '\0';
+   }
+
 
    s[1] = 0;
 
@@ -2031,11 +2043,20 @@ CopyReadLine(void)
 
    /*
     * In this loop we only care for detecting newlines (\r and/or \n) and
-    * the end-of-copy marker (\.).  For backwards compatibility we allow
+    * the end-of-copy marker (\.).  
+    *
+    * In Text mode, for backwards compatibility we allow
     * backslashes to escape newline characters.  Backslashes other than
     * the end marker get put into the line_buf, since CopyReadAttribute
-    * does its own escape processing.  These four characters, and only
-    * these four, are assumed the same in frontend and backend encodings.
+    * does its own escape processing.  
+    *
+    * In CSV mode, CR and NL inside q quoted field are just part of the
+    * data value and are put in line_buf. We keep just enough state
+    * to know if we are currently in a quoted field or not.
+    *
+    * These four characters, and only these four, are assumed the same in 
+    * frontend and backend encodings.
+    *
     * We do not assume that second and later bytes of a frontend
     * multibyte character couldn't look like ASCII characters.
     */
@@ -2047,13 +2068,49 @@ CopyReadLine(void)
            result = true;
            break;
        }
-       if (c == '\r')
+
+       if (csv_mode)
+       {
+           /*  
+            * Dealing with quotes and escapes here is mildly tricky. If the
+            * quote char is also the escape char, there's no problem - we  
+            * just use the char as a toggle. If they are different, we need
+            * to ensure that we only take account of an escape inside a quoted
+            * field and immediately preceding a quote char, and not the
+            * second in a escape-escape sequence.
+            */ 
+
+           if (in_quote && c == escapec)
+               last_was_esc = ! last_was_esc;
+           if (c == quotec && ! last_was_esc)
+               in_quote = ! in_quote;
+           if (c != escapec)
+               last_was_esc = false;
+
+           /*
+            * updating the line count for embedded CR and/or LF chars is 
+            * necessarily a little fragile - this test is probably about 
+            * the best we can do.
+            */ 
+           if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) 
+               copy_lineno++; 
+       }
+
+       if (!in_quote && c == '\r')
        {
            if (eol_type == EOL_NL)
-               ereport(ERROR,
-                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                        errmsg("literal carriage return found in data"),
-                 errhint("Use \"\\r\" to represent carriage return.")));
+           {
+               if (! csv_mode)
+                   ereport(ERROR,
+                           (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                            errmsg("literal carriage return found in data"),
+                            errhint("Use \"\\r\" to represent carriage return.")));
+               else
+                   ereport(ERROR,
+                           (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                            errmsg("unquoted carriage return found in CSV data"),
+                            errhint("Use quoted CSV field to represent carriage return.")));
+           }
            /* Check for \r\n on first line, _and_ handle \r\n. */
            if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
            {
@@ -2068,10 +2125,19 @@ CopyReadLine(void)
                {
                    /* found \r, but no \n */
                    if (eol_type == EOL_CRNL)
-                       ereport(ERROR,
-                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                        errmsg("literal carriage return found in data"),
-                                errhint("Use \"\\r\" to represent carriage return.")));
+                   {
+                       if (!csv_mode)
+                           ereport(ERROR,
+                                   (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                    errmsg("literal carriage return found in data"),
+                                    errhint("Use \"\\r\" to represent carriage return.")));
+                       else
+                           ereport(ERROR,
+                                   (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                    errmsg("unquoted carriage return found in data"),
+                                    errhint("Use quoted CSV field to represent carriage return.")));
+
+                   }
 
                    /*
                     * if we got here, it is the first line and we didn't
@@ -2083,26 +2149,47 @@ CopyReadLine(void)
            }
            break;
        }
-       if (c == '\n')
+       if (!in_quote && c == '\n')
        {
            if (eol_type == EOL_CR || eol_type == EOL_CRNL)
-               ereport(ERROR,
-                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                        errmsg("literal newline found in data"),
-                        errhint("Use \"\\n\" to represent newline.")));
+           {
+               if (!csv_mode)
+                   ereport(ERROR,
+                           (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                            errmsg("literal newline found in data"),
+                            errhint("Use \"\\n\" to represent newline.")));
+               else
+                   ereport(ERROR,
+                           (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                            errmsg("unquoted newline found in data"),
+                            errhint("Use quoted CSV field to represent newline.")));
+                   
+           }
            eol_type = EOL_NL;
            break;
        }
-       if (c == '\\')
+
+       if ((line_buf.len == 0 || !csv_mode) && c == '\\')
        {
-           c = CopyGetChar();
-           if (c == EOF)
+           int c2;
+           
+           if (csv_mode)
+               c2 = CopyPeekChar();
+           else
+               c2 = c = CopyGetChar();
+
+           if (c2 == EOF)
            {
                result = true;
+               if (csv_mode)
+                   CopyDonePeek(c2, true);
                break;
            }
-           if (c == '.')
+           if (c2 == '.')
            {
+               if (csv_mode)
+                   CopyDonePeek(c2, true); /* allow keep calling GetChar() */
+
                if (eol_type == EOL_CRNL)
                {
                    c = CopyGetChar();
@@ -2140,8 +2227,12 @@ CopyReadLine(void)
                result = true;  /* report EOF */
                break;
            }
-           /* not EOF mark, so emit \ and following char literally */
-           appendStringInfoCharMacro(&line_buf, '\\');
+           
+           if (csv_mode)
+               CopyDonePeek(c2, false); /* not a dot, so put it back */ 
+           else
+               /* not EOF mark, so emit \ and following char literally */
+               appendStringInfoCharMacro(&line_buf, '\\');
        }
 
        appendStringInfoCharMacro(&line_buf, c);
@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
 
    for (;;)
    {
-       /* handle multiline quoted fields */
-       if (in_quote && line_buf.cursor >= line_buf.len)
-       {
-           bool        done;
-
-           switch (eol_type)
-           {
-               case EOL_NL:
-                   appendStringInfoString(&attribute_buf, "\n");
-                   break;
-               case EOL_CR:
-                   appendStringInfoString(&attribute_buf, "\r");
-                   break;
-               case EOL_CRNL:
-                   appendStringInfoString(&attribute_buf, "\r\n");
-                   break;
-               case EOL_UNKNOWN:
-                   /* shouldn't happen - just keep going */
-                   break;
-           }
-
-           copy_lineno++;
-           done = CopyReadLine();
-           if (done && line_buf.len == 0)
-               break;
-           start_cursor = line_buf.cursor;
-       }
-
        end_cursor = line_buf.cursor;
        if (line_buf.cursor >= line_buf.len)
            break;
@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
         !use_quote && (c = *test_string) != '\0';
         test_string += mblen)
    {
-       /*
-        * We don't know here what the surrounding line end characters
-        * might be. It might not even be under postgres' control. So
-        * we simple warn on ANY embedded line ending character.
-        *
-        * This warning will disappear when we make line parsing field-aware,
-        * so that we can reliably read in embedded line ending characters
-        * regardless of the file's line-end context.
-        *
-        */
-
-       if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-       {
-           embedded_line_warning = true;
-           elog(WARNING,
-                "CSV fields with embedded linefeed or carriage return "
-                "characters might not be able to be reimported");
-       }
-
        if (c == delimc || c == quotec || c == '\n' || c == '\r')
            use_quote = true;
        if (!same_encoding)