Do not treat \. as an EOF marker in CSV mode for COPY IN.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)
diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml

index 783e8e750bb7ff5103421a4d14b63ff4c68457e9..4a727d4499708c8706fa6853cecd29a001557da7 100644 (file)
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -7381,8 +7381,9 @@ int PQputline(PGconn *conn,
          <literal>\.</literal> as a final line to indicate to the server that it had
          finished sending <command>COPY</command> data.  While this still works, it is deprecated and the
          special meaning of <literal>\.</literal> can be expected to be removed in a
-        future release.  It is sufficient to call <xref linkend="libpq-PQendcopy"/> after
-        having sent the actual data.
+        future release.  (It already will misbehave in <literal>CSV</literal>
+        mode.)  It is sufficient to call <xref linkend="libpq-PQendcopy"/>
+        after having sent the actual data.
         </para>
        </note>
       </listitem>
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml

index 11b6456779784d79886a07b4ee07bef4fe14c640..2d2481bb8b8a636c56f5952a2099f1419de846d0 100644 (file)
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -7606,8 +7606,9 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
     is a well-defined way to recover from errors during <command>COPY</command>.  The special
     <quote><literal>\.</literal></quote> last line is not needed anymore, and is not sent
     during <command>COPY OUT</command>.
-   (It is still recognized as a terminator during <command>COPY IN</command>, but its use is
-   deprecated and will eventually be removed.)  Binary <command>COPY</command> is supported.
+   (It is still recognized as a terminator during text-mode <command>COPY
+   IN</command>, but not in CSV mode.  The text-mode behavior is
+   deprecated and may eventually be removed.)  Binary <command>COPY</command> is supported.
     The CopyInResponse and CopyOutResponse messages include fields indicating
     the number of columns and the format of each column.
    </para>
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml

index 1518af8a0450c90319fe43ac32ab84bb1ce2361e..fdbd20bc50b100198b4103ae9b7ea7c9e35818e6 100644 (file)
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -646,11 +646,16 @@ COPY <replaceable class="parameter">count</replaceable>
     </para>
  
     <para>
-    End of data can be represented by a single line containing just
+    End of data can be represented by a line containing just
      backslash-period (<literal>\.</literal>).  An end-of-data marker is
      not necessary when reading from a file, since the end of file
-    serves perfectly well; it is needed only when copying data to or from
-    client applications using pre-3.0 client protocol.
+    serves perfectly well; when reading from a file, this provision exists
+    only for backward compatibility.  However, <application>psql</application>
+    uses <literal>\.</literal> to terminate a <literal>COPY FROM
+    STDIN</literal> operation (that is, reading
+    in-line <command>COPY</command> data in an SQL script).  There, the
+    marker is needed to be able to end the operation before the
+    end of the script.
     </para>
  
     <para>
@@ -811,16 +816,25 @@ COPY <replaceable class="parameter">count</replaceable>
  
     <para>
      Because backslash is not a special character in the <literal>CSV</literal>
-    format, <literal>\.</literal>, the end-of-data marker, could also appear
-    as a data value.  To avoid any misinterpretation, a <literal>\.</literal>
-    data value appearing as a lone entry on a line is automatically
-    quoted on output, and on input, if quoted, is not interpreted as the
-    end-of-data marker.  If you are loading a file created by another
-    application that has a single unquoted column and might have a
-    value of <literal>\.</literal>, you might need to quote that value in the
-    input file.
+    format, the end-of-data marker used in text mode (<literal>\.</literal>)
+    is not normally treated as special when reading <literal>CSV</literal>
+    data.  An exception is that <application>psql</application> will terminate
+    a <literal>COPY FROM STDIN</literal> operation (that is, reading
+    in-line <command>COPY</command> data in an SQL script) at a line containing
+    only <literal>\.</literal>, in both text and <literal>CSV</literal>
+    mode.
     </para>
  
+   <note>
+    <para>
+     <productname>PostgreSQL</productname> versions before v18 always
+     recognized unquoted <literal>\.</literal> as an end-of-data marker,
+     even when reading from a separate file.  For compatibility with older
+     versions, <command>COPY TO</command> will quote <literal>\.</literal>
+     when it's alone on a line, even though this is no longer necessary.
+    </para>
+   </note>
+
     <note>
      <para>
       In <literal>CSV</literal> format, all characters are significant. A quoted value
diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml

index 3fd9959ed168caf386acce840015894c929ffeec..b825ca96a2373154221923f6b68f9c62a0918df1 100644 (file)
--- a/doc/src/sgml/ref/psql-ref.sgml
+++ b/doc/src/sgml/ref/psql-ref.sgml
@@ -1135,7 +1135,8 @@ SELECT $1 \parse stmt1
  
          <para>
          For <literal>\copy ... from stdin</literal>, data rows are read from the same
-        source that issued the command, continuing until <literal>\.</literal>
+        source that issued the command, continuing until a line containing
+        only <literal>\.</literal>
          is read or the stream reaches <acronym>EOF</acronym>. This option is useful
          for populating tables in-line within an SQL script file.
          For <literal>\copy ... to stdout</literal>, output is sent to the same place
@@ -1179,10 +1180,6 @@ SELECT $1 \parse stmt1
          destination, because all data must pass through the client/server
          connection.  For large amounts of data the <acronym>SQL</acronym>
          command might be preferable.
-        Also, because of this pass-through method, <literal>\copy
-        ... from</literal> in <acronym>CSV</acronym> mode will erroneously
-        treat a <literal>\.</literal> data value alone on a line as an
-        end-of-input marker.
          </para>
          </tip>
  
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c

index 97a4c387a3058290b74b25c7937e213b0e5119c0..a280efe23f98304dee1af549a5d596b9b5febbf6 100644 (file)
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -136,14 +136,6 @@ if (1) \
     } \
  } else ((void) 0)
  
-/* Undo any read-ahead and jump out of the block. */
-#define NO_END_OF_COPY_GOTO \
-if (1) \
-{ \
-   input_buf_ptr = prev_raw_ptr + 1; \
-   goto not_end_of_copy; \
-} else ((void) 0)
-
  /* NOTE: there's a copy of this in copyto.c */
  static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
  
@@ -1182,7 +1174,6 @@ CopyReadLineText(CopyFromState cstate)
     bool        result = false;
  
     /* CSV variables */
-   bool        first_char_in_line = true;
     bool        in_quote = false,
                 last_was_esc = false;
     char        quotec = '\0';
@@ -1268,12 +1259,12 @@ CopyReadLineText(CopyFromState cstate)
         if (cstate->opts.csv_mode)
         {
             /*
-            * If character is '\\' or '\r', we may need to look ahead below.
-            * Force fetch of the next character if we don't already have it.
-            * We need to do this before changing CSV state, in case one of
-            * these characters is also the quote or escape character.
+            * If character is '\r', we may need to look ahead below.  Force
+            * fetch of the next character if we don't already have it.  We
+            * need to do this before changing CSV state, in case '\r' is also
+            * the quote or escape character.
              */
-           if (c == '\\' || c == '\r')
+           if (c == '\r')
             {
                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
             }
@@ -1377,10 +1368,10 @@ CopyReadLineText(CopyFromState cstate)
         }
  
         /*
-        * In CSV mode, we only recognize \. alone on a line.  This is because
-        * \. is a valid CSV data value.
+        * Process backslash, except in CSV mode where backslash is a normal
+        * character.
          */
-       if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
+       if (c == '\\' && !cstate->opts.csv_mode)
         {
             char        c2;
  
@@ -1398,12 +1389,6 @@ CopyReadLineText(CopyFromState cstate)
             if (c2 == '.')
             {
                 input_buf_ptr++;    /* consume the '.' */
-
-               /*
-                * Note: if we loop back for more data here, it does not
-                * matter that the CSV state change checks are re-executed; we
-                * will come back here with no important state changed.
-                */
                 if (cstate->eol_type == EOL_CRNL)
                 {
                     /* Get the next character */
@@ -1412,23 +1397,13 @@ CopyReadLineText(CopyFromState cstate)
                     c2 = copy_input_buf[input_buf_ptr++];
  
                     if (c2 == '\n')
-                   {
-                       if (!cstate->opts.csv_mode)
-                           ereport(ERROR,
-                                   (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                    errmsg("end-of-copy marker does not match previous newline style")));
-                       else
-                           NO_END_OF_COPY_GOTO;
-                   }
+                       ereport(ERROR,
+                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                errmsg("end-of-copy marker does not match previous newline style")));
                     else if (c2 != '\r')
-                   {
-                       if (!cstate->opts.csv_mode)
-                           ereport(ERROR,
-                                   (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                    errmsg("end-of-copy marker corrupt")));
-                       else
-                           NO_END_OF_COPY_GOTO;
-                   }
+                       ereport(ERROR,
+                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                errmsg("end-of-copy marker corrupt")));
                 }
  
                 /* Get the next character */
@@ -1437,14 +1412,9 @@ CopyReadLineText(CopyFromState cstate)
                 c2 = copy_input_buf[input_buf_ptr++];
  
                 if (c2 != '\r' && c2 != '\n')
-               {
-                   if (!cstate->opts.csv_mode)
-                       ereport(ERROR,
-                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                errmsg("end-of-copy marker corrupt")));
-                   else
-                       NO_END_OF_COPY_GOTO;
-               }
+                   ereport(ERROR,
+                           (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                            errmsg("end-of-copy marker corrupt")));
  
                 if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
                     (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
@@ -1467,7 +1437,7 @@ CopyReadLineText(CopyFromState cstate)
                 result = true;  /* report EOF */
                 break;
             }
-           else if (!cstate->opts.csv_mode)
+           else
             {
                 /*
                  * If we are here, it means we found a backslash followed by
@@ -1475,23 +1445,11 @@ CopyReadLineText(CopyFromState cstate)
                  * after a backslash is special, so we skip over that second
                  * character too.  If we didn't do that \\. would be
                  * considered an eof-of copy, while in non-CSV mode it is a
-                * literal backslash followed by a period.  In CSV mode,
-                * backslashes are not special, so we want to process the
-                * character after the backslash just like a normal character,
-                * so we don't increment in those cases.
+                * literal backslash followed by a period.
                  */
                 input_buf_ptr++;
             }
         }
-
-       /*
-        * This label is for CSV cases where \. appears at the start of a
-        * line, but there is more text after it, meaning it was a data value.
-        * We are more strict for \. in CSV mode because \. could be a data
-        * value, while in non-CSV mode, \. cannot be a data value.
-        */
-not_end_of_copy:
-       first_char_in_line = false;
     }                           /* end of outer loop */
  
     /*
diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c

index 91de442f4341a614d175ff5b599839ecbcc9e7c8..463083e645dd999ac0c1a438b5978cfab25207ff 100644 (file)
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -1160,8 +1160,11 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
     if (!use_quote)
     {
         /*
-        * Because '\.' can be a data value, quote it if it appears alone on a
-        * line so it is not interpreted as the end-of-data marker.
+        * Quote '\.' if it appears alone on a line, so that it will not be
+        * interpreted as an end-of-data marker.  (PG 18 and up will not
+        * interpret '\.' in CSV that way, except in embedded-in-SQL data; but
+        * we want the data to be loadable by older versions too.  Also, this
+        * avoids breaking clients that are still using PQgetline().)
          */
         if (single_attr && strcmp(ptr, "\\.") == 0)
             use_quote = true;
diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c

index 961ae3269493a76a40c0957fc86ba93ba001d95c..e020e4d665dab5b609607749ed378bb5bb542cdb 100644 (file)
--- a/src/bin/psql/copy.c
+++ b/src/bin/psql/copy.c
@@ -620,20 +620,29 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res)
                 /* current line is done? */
                 if (buf[buflen - 1] == '\n')
                 {
-                   /* check for EOF marker, but not on a partial line */
-                   if (at_line_begin)
+                   /*
+                    * When at the beginning of a line of inlined data (that is,
+                    * data read from the same source as the command), check for
+                    * the EOF marker.  If it is found, we must stop here.  For
+                    * non-inlined data, a \. line is simply sent to the server,
+                    * which decides whether it is an EOF marker based on the
+                    * format: in TEXT mode it is, in CSV mode it is not.
+                    */
+                   if (at_line_begin && copystream == pset.cur_cmd_source)
                     {
-                       /*
-                        * This code erroneously assumes '\.' on a line alone
-                        * inside a quoted CSV string terminates the \copy.
-                        * https://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org
-                        *
-                        * https://www.postgresql.org/message-id/bfcd57e4-8f23-4c3e-a5db-2571d09208e2@beta.fastmail.com
-                        */
                         if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) ||
                             (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0))
                         {
                             copydone = true;
+
+                           /*
+                            * Remove the EOF marker from the data sent.  In
+                            * CSV mode, the EOF marker must be removed,
+                            * otherwise it would be interpreted by the server
+                            * as valid data.
+                            */
+                           *fgresult = '\0';
+                           buflen -= linelen;
                         }
                     }
  
diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out

index 44114089a6dce9807223151ea9215e5247205575..174fe056033e2b0ac337205d2acb68c6dfbee4ad 100644 (file)
--- a/src/test/regress/expected/copy.out
+++ b/src/test/regress/expected/copy.out
@@ -32,6 +32,24 @@ select * from copytest except select * from copytest2;
  -------+------+--------
  (0 rows)
  
+--- test unquoted \. as data inside CSV
+-- do not use copy out to export the data, as it would quote \.
+\o :filename
+\qecho line1
+\qecho '\\.'
+\qecho line2
+\o
+-- get the data back in with copy
+truncate copytest2;
+copy copytest2(test) from :'filename' csv;
+select test from copytest2 order by test collate "C";
+ test  
+-------
+ \.
+ line1
+ line2
+(3 rows)
+
  -- test header line feature
  create temp table copytest3 (
     c1 int,
diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql

index e2dd24cb3519b61c269f099143ceea036901a648..8ed7922ab492aeaab097a5bd8a45a9cfe768aefc 100644 (file)
--- a/src/test/regress/sql/copy.sql
+++ b/src/test/regress/sql/copy.sql
@@ -38,6 +38,18 @@ copy copytest2 from :'filename' csv quote '''' escape E'\\';
  
  select * from copytest except select * from copytest2;
  
+--- test unquoted \. as data inside CSV
+-- do not use copy out to export the data, as it would quote \.
+\o :filename
+\qecho line1
+\qecho '\\.'
+\qecho line2
+\o
+-- get the data back in with copy
+truncate copytest2;
+copy copytest2(test) from :'filename' csv;
+select test from copytest2 order by test collate "C";
+
  
  -- test header line feature
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 30 Sep 2024 21:57:12 +0000 (17:57 -0400)
doc/src/sgml/libpq.sgml		patch \| blob \| blame \| history
doc/src/sgml/protocol.sgml		patch \| blob \| blame \| history
doc/src/sgml/ref/copy.sgml		patch \| blob \| blame \| history
doc/src/sgml/ref/psql-ref.sgml		patch \| blob \| blame \| history
src/backend/commands/copyfromparse.c		patch \| blob \| blame \| history
src/backend/commands/copyto.c		patch \| blob \| blame \| history
src/bin/psql/copy.c		patch \| blob \| blame \| history
src/test/regress/expected/copy.out		patch \| blob \| blame \| history
src/test/regress/sql/copy.sql		patch \| blob \| blame \| history