From 5b570d771b80aadc98755208f8f1b81e9a5eb366 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 25 Feb 2018 17:27:20 -0500 Subject: [PATCH] Un-break parallel pg_upgrade. Commit b3f840120 changed pg_upgrade so that it'd actually drop and re-create the template1 and postgres databases in the new cluster. That works fine, serially. With the -j option it's not so fine, because other per-database jobs might be launched while the template1 database is dropped. Since they attempt to connect there to start up, kaboom. This is the cause of the intermittent failures buildfarm member jacana has been showing for the last month; evidently it is the only BF member configured to run the pg_upgrade test with parallelism enabled. Fix by processing template1 separately before we get into the parallel sub-job launch loop. (We could alternatively have made the postgres DB be the special case, but it seems likely that template1 will contain less stuff and so we lose less parallelism with this choice.) --- src/bin/pg_upgrade/pg_upgrade.c | 59 +++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index bbfa4c1ef38..d12412799fa 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -302,13 +302,21 @@ create_new_objects(void) prep_status("Restoring database schemas in the new cluster\n"); + /* + * We cannot process the template1 database concurrently with others, + * because when it's transiently dropped, connection attempts would fail. + * So handle it in a separate non-parallelized pass. + */ for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++) { char sql_file_name[MAXPGPATH], log_file_name[MAXPGPATH]; DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum]; const char *create_opts; - const char *starting_db; + + /* Process only template1 in this pass */ + if (strcmp(old_db->db_name, "template1") != 0) + continue; pg_log(PG_STATUS, "%s", old_db->db_name); snprintf(sql_file_name, sizeof(sql_file_name), DB_DUMP_FILE_MASK, old_db->db_oid); @@ -320,26 +328,55 @@ create_new_objects(void) * otherwise we would fail to propagate their database-level * properties. */ - if (strcmp(old_db->db_name, "template1") == 0 || - strcmp(old_db->db_name, "postgres") == 0) - create_opts = "--clean --create"; - else - create_opts = "--create"; + create_opts = "--clean --create"; + + exec_prog(log_file_name, + NULL, + true, + true, + "\"%s/pg_restore\" %s %s --exit-on-error --verbose " + "--dbname postgres \"%s\"", + new_cluster.bindir, + cluster_conn_opts(&new_cluster), + create_opts, + sql_file_name); - /* When processing template1, we can't connect there to start with */ + break; /* done once we've processed template1 */ + } + + for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++) + { + char sql_file_name[MAXPGPATH], + log_file_name[MAXPGPATH]; + DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum]; + const char *create_opts; + + /* Skip template1 in this pass */ if (strcmp(old_db->db_name, "template1") == 0) - starting_db = "postgres"; + continue; + + pg_log(PG_STATUS, "%s", old_db->db_name); + snprintf(sql_file_name, sizeof(sql_file_name), DB_DUMP_FILE_MASK, old_db->db_oid); + snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); + + /* + * template1 and postgres databases will already exist in the target + * installation, so tell pg_restore to drop and recreate them; + * otherwise we would fail to propagate their database-level + * properties. + */ + if (strcmp(old_db->db_name, "postgres") == 0) + create_opts = "--clean --create"; else - starting_db = "template1"; + create_opts = "--create"; parallel_exec_prog(log_file_name, NULL, "\"%s/pg_restore\" %s %s --exit-on-error --verbose " - "--dbname %s \"%s\"", + "--dbname template1 \"%s\"", new_cluster.bindir, cluster_conn_opts(&new_cluster), create_opts, - starting_db, sql_file_name); } -- 2.30.2