Add header matching mode to COPY FROM
authorPeter Eisentraut <peter@eisentraut.org>
Wed, 30 Mar 2022 06:56:58 +0000 (08:56 +0200)
committerPeter Eisentraut <peter@eisentraut.org>
Wed, 30 Mar 2022 07:02:31 +0000 (09:02 +0200)
COPY FROM supports the HEADER option to silently discard the header
line from a CSV or text file.  It is possible to load by mistake a
file that matches the expected format, for example, if two text
columns have been swapped, resulting in garbage in the database.

This adds a new option value HEADER MATCH that checks the column names
in the header line against the actual column names and errors out if
they do not match.

Author: RĂ©mi Lapeyre <remi.lapeyre@lenstra.fr>
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com>
Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com

contrib/file_fdw/expected/file_fdw.out
contrib/file_fdw/sql/file_fdw.sql
doc/src/sgml/ref/copy.sgml
src/backend/commands/copy.c
src/backend/commands/copyfromparse.c
src/include/commands/copy.h
src/test/regress/expected/copy.out
src/test/regress/sql/copy.sql

index 14acdb27e5b62684ebb7be62196c75b1c51f89c6..0029f36b3593ce9f7f05138cf6b89817de3e39a0 100644 (file)
@@ -113,6 +113,21 @@ CREATE FOREIGN TABLE agg_bad (
 ) SERVER file_server
 OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
+-- test header matching
+\set filename :abs_srcdir '/data/list1.csv'
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+SELECT * FROM header_match;
+ 1 | foo 
+---+-----
+ 1 | bar
+(1 row)
+
+CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+SELECT * FROM header_doesnt_match; -- ERROR
+ERROR:  column name mismatch in header line field 1: got "1", expected "a"
+CONTEXT:  COPY header_doesnt_match, line 1: "1,foo"
 -- per-column options tests
 \set filename :abs_srcdir '/data/text.csv'
 CREATE FOREIGN TABLE text_csv (
@@ -464,12 +479,14 @@ SET ROLE regress_file_fdw_superuser;
 -- cleanup
 RESET ROLE;
 DROP EXTENSION file_fdw CASCADE;
-NOTICE:  drop cascades to 7 other objects
+NOTICE:  drop cascades to 9 other objects
 DETAIL:  drop cascades to server file_server
 drop cascades to user mapping for regress_file_fdw_superuser on server file_server
 drop cascades to user mapping for regress_no_priv_user on server file_server
 drop cascades to foreign table agg_text
 drop cascades to foreign table agg_csv
 drop cascades to foreign table agg_bad
+drop cascades to foreign table header_match
+drop cascades to foreign table header_doesnt_match
 drop cascades to foreign table text_csv
 DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
index 86f876d565f1766da5bbfcb812b1b57e2bafaa14..563d824ccc8c85373bc6dbf3114e69724662c12f 100644 (file)
@@ -103,6 +103,15 @@ CREATE FOREIGN TABLE agg_bad (
 OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
 
+-- test header matching
+\set filename :abs_srcdir '/data/list1.csv'
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+SELECT * FROM header_match;
+CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+SELECT * FROM header_doesnt_match; -- ERROR
+
 -- per-column options tests
 \set filename :abs_srcdir '/data/text.csv'
 CREATE FOREIGN TABLE text_csv (
index 1b7d001963667c0299bc4f6f57750164cc88ecee..e7a6676efd053613b3bf9e3b6ec355c85dd123e2 100644 (file)
@@ -36,7 +36,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
     DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
     NULL '<replaceable class="parameter">null_string</replaceable>'
-    HEADER [ <replaceable class="parameter">boolean</replaceable> ]
+    HEADER [ <replaceable class="parameter">boolean</replaceable> | <literal>match</literal> ]
     QUOTE '<replaceable class="parameter">quote_character</replaceable>'
     ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
     FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
@@ -276,7 +276,11 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      <para>
       Specifies that the file contains a header line with the names of each
       column in the file.  On output, the first line contains the column
-      names from the table, and on input, the first line is ignored.
+      names from the table.  On input, the first line is discarded when this
+      option is set to <literal>true</literal> (or equivalent Boolean value).
+      If this option is set to <literal>match</literal>, the number and names
+      of the columns in the header line must match the actual column names of
+      the table, otherwise an error is raised.
       This option is not allowed when using <literal>binary</literal> format.
      </para>
     </listitem>
index 7a0c897cc97f320f05590ab5dd961931abd00a6b..689713ea5802b7303a6cd48830cdd9f1565407e6 100644 (file)
@@ -313,6 +313,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
                table_close(rel, NoLock);
 }
 
+/*
+ * Extract a CopyHeaderChoice value from a DefElem.  This is like
+ * defGetBoolean() but also accepts the special value "match".
+ */
+static CopyHeaderChoice
+defGetCopyHeaderChoice(DefElem *def)
+{
+       /*
+        * If no parameter given, assume "true" is meant.
+        */
+       if (def->arg == NULL)
+               return COPY_HEADER_TRUE;
+
+       /*
+        * Allow 0, 1, "true", "false", "on", "off", or "match".
+        */
+       switch (nodeTag(def->arg))
+       {
+               case T_Integer:
+                       switch (intVal(def->arg))
+                       {
+                               case 0:
+                                       return COPY_HEADER_FALSE;
+                               case 1:
+                                       return COPY_HEADER_TRUE;
+                               default:
+                                       /* otherwise, error out below */
+                                       break;
+                       }
+                       break;
+               default:
+                       {
+                               char    *sval = defGetString(def);
+
+                               /*
+                                * The set of strings accepted here should match up with the
+                                * grammar's opt_boolean_or_string production.
+                                */
+                               if (pg_strcasecmp(sval, "true") == 0)
+                                       return COPY_HEADER_TRUE;
+                               if (pg_strcasecmp(sval, "false") == 0)
+                                       return COPY_HEADER_FALSE;
+                               if (pg_strcasecmp(sval, "on") == 0)
+                                       return COPY_HEADER_TRUE;
+                               if (pg_strcasecmp(sval, "off") == 0)
+                                       return COPY_HEADER_FALSE;
+                               if (pg_strcasecmp(sval, "match") == 0)
+                                       return COPY_HEADER_MATCH;
+                       }
+                       break;
+       }
+       ereport(ERROR,
+                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                errmsg("%s requires a Boolean value or \"match\"",
+                                       def->defname)));
+       return COPY_HEADER_FALSE;       /* keep compiler quiet */
+}
+
 /*
  * Process the statement option list for COPY.
  *
@@ -394,7 +452,7 @@ ProcessCopyOptions(ParseState *pstate,
                        if (header_specified)
                                errorConflictingDefElem(defel, pstate);
                        header_specified = true;
-                       opts_out->header_line = defGetBoolean(defel);
+                       opts_out->header_line = defGetCopyHeaderChoice(defel);
                }
                else if (strcmp(defel->defname, "quote") == 0)
                {
index baf328b620c98e2afdb6ff7e4dd7c5b61202b196..58017ec53b005792b6583c434b6c7fc10d88f7e4 100644 (file)
@@ -72,6 +72,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -758,12 +759,58 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
        /* only available for text or csv input */
        Assert(!cstate->opts.binary);
 
-       /* on input just throw the header line away */
+       /* on input check that the header line is correct if needed */
        if (cstate->cur_lineno == 0 && cstate->opts.header_line)
        {
+               ListCell   *cur;
+               TupleDesc       tupDesc;
+
+               tupDesc = RelationGetDescr(cstate->rel);
+
                cstate->cur_lineno++;
-               if (CopyReadLine(cstate))
-                       return false;           /* done */
+               done = CopyReadLine(cstate);
+
+               if (cstate->opts.header_line == COPY_HEADER_MATCH)
+               {
+                       int                     fldnum;
+
+                       if (cstate->opts.csv_mode)
+                               fldct = CopyReadAttributesCSV(cstate);
+                       else
+                               fldct = CopyReadAttributesText(cstate);
+
+                       if (fldct != list_length(cstate->attnumlist))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                errmsg("wrong number of fields in header line: field count is %d, expected %d",
+                                                               fldct, list_length(cstate->attnumlist))));
+
+                       fldnum = 0;
+                       foreach(cur, cstate->attnumlist)
+                       {
+                               int                     attnum = lfirst_int(cur);
+                               char       *colName = cstate->raw_fields[attnum - 1];
+                               Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
+
+                               fldnum++;
+
+                               if (colName == NULL)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
+                                                                       fldnum, cstate->opts.null_print, NameStr(attr->attname))));
+
+                               if (namestrcmp(&attr->attname, colName) != 0) {
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
+                                                                       fldnum, colName, NameStr(attr->attname))));
+                               }
+                       }
+               }
+
+               if (done)
+                       return false;
        }
 
        cstate->cur_lineno++;
index 8694da5004cdaebf7cea9d5a4992fe6532ee15e6..cb0096aeb67440ebf25e83688bec2056596b9b3b 100644 (file)
 #include "parser/parse_node.h"
 #include "tcop/dest.h"
 
+/*
+ * Represents whether a header line should be present, and whether it must
+ * match the actual names (which implies "true").
+ */
+typedef enum CopyHeaderChoice
+{
+       COPY_HEADER_FALSE = 0,
+       COPY_HEADER_TRUE,
+       COPY_HEADER_MATCH,
+} CopyHeaderChoice;
+
 /*
  * A struct to hold COPY options, in a parsed form. All of these are related
  * to formatting, except for 'freeze', which doesn't really belong here, but
@@ -32,7 +43,7 @@ typedef struct CopyFormatOptions
        bool            binary;                 /* binary format? */
        bool            freeze;                 /* freeze rows on loading? */
        bool            csv_mode;               /* Comma Separated Value format? */
-       bool            header_line;    /* header line? */
+       CopyHeaderChoice header_line;   /* header line? */
        char       *null_print;         /* NULL marker string (server encoding!) */
        int                     null_print_len; /* length of same */
        char       *null_print_client;  /* same converted to file encoding */
index bec19dc44dd943f4b93dbcba16bc86dc85ba5f88..e8d6b4fc13b0214f252bd89836f696ab5e829d77 100644 (file)
@@ -176,3 +176,26 @@ INFO:  progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre
 drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
 drop function notice_after_tab_progress_reporting();
 drop table tab_progress_reporting;
+-- Test header matching feature
+create table header_copytest (
+       a int,
+       b int,
+       c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+ERROR:  header requires a Boolean value or "match"
+copy header_copytest from stdin with (header match);
+copy header_copytest from stdin with (header match);
+ERROR:  column name mismatch in header line field 3: got null value ("\N"), expected "c"
+CONTEXT:  COPY header_copytest, line 1: "a     b       \N"
+copy header_copytest from stdin with (header match);
+ERROR:  wrong number of fields in header line: field count is 2, expected 3
+CONTEXT:  COPY header_copytest, line 1: "a     b"
+copy header_copytest from stdin with (header match);
+ERROR:  wrong number of fields in header line: field count is 4, expected 3
+CONTEXT:  COPY header_copytest, line 1: "a     b       c       d"
+copy header_copytest from stdin with (header match);
+ERROR:  column name mismatch in header line field 3: got "d", expected "c"
+CONTEXT:  COPY header_copytest, line 1: "a     b       d"
+copy header_copytest from stdin with (header match, format csv);
+drop table header_copytest;
index eb302773ab930a115fa296e7cb5a18b7fa242673..d72d226f341f42c69ffcb773c4faf53d9e586894 100644 (file)
@@ -197,3 +197,36 @@ copy tab_progress_reporting from :'filename'
 drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
 drop function notice_after_tab_progress_reporting();
 drop table tab_progress_reporting;
+
+-- Test header matching feature
+create table header_copytest (
+       a int,
+       b int,
+       c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+copy header_copytest from stdin with (header match);
+a      b       c
+1      2       foo
+\.
+copy header_copytest from stdin with (header match);
+a      b       \N
+1      2       foo
+\.
+copy header_copytest from stdin with (header match);
+a      b
+1      2
+\.
+copy header_copytest from stdin with (header match);
+a      b       c       d
+1      2       foo     bar
+\.
+copy header_copytest from stdin with (header match);
+a      b       d
+1      2       foo
+\.
+copy header_copytest from stdin with (header match, format csv);
+a,b,c
+1,2,foo
+\.
+drop table header_copytest;