 contrib/Makefile | 1
 contrib/hashtest/Makefile | 18
 contrib/hashtest/hashtest--1.0.sql | 52
 contrib/hashtest/hashtest.c | 527
 contrib/hashtest/hashtest.control | 4
 contrib/pg_upgrade/check.c | 216
 contrib/pg_upgrade/controldata.c | 34
 contrib/pg_upgrade/info.c | 14
 contrib/pg_upgrade/pg_upgrade.h | 8
 doc/src/sgml/catalogs.sgml | 3
 doc/src/sgml/json.sgml | 12
 doc/src/sgml/plpgsql.sgml | 4
 src/Makefile.global.in | 2
 src/backend/access/index/genam.c | 1
 src/backend/access/nbtree/nbtpage.c | 2
 src/backend/access/transam/xlog.c | 6
 src/backend/catalog/heap.c | 1
 src/backend/commands/dbcommands.c | 2
 src/backend/commands/explain.c | 12
 src/backend/commands/matview.c | 2
 src/backend/commands/tablecmds.c | 3
 src/backend/commands/typecmds.c | 1
 src/backend/commands/view.c | 2
 src/backend/executor/nodeHash.c | 131
 src/backend/libpq/be-secure-openssl.c | 2
 src/backend/storage/buffer/buf_table.c | 109
 src/backend/storage/buffer/bufmgr.c | 194
 src/backend/storage/ipc/shm_mq.c | 126
 src/backend/storage/ipc/shmem.c | 23
 src/backend/utils/adt/jsonb_op.c | 10
 src/backend/utils/adt/jsonb_util.c | 16
 src/backend/utils/adt/misc.c | 1
 src/backend/utils/adt/ruleutils.c | 1
 src/backend/utils/hash/Makefile | 2
 src/backend/utils/hash/chash.c | 1075
 src/bin/pg_basebackup/pg_receivexlog.c | 10
 src/bin/pg_basebackup/pg_recvlogical.c | 31
 src/bin/pg_ctl/pg_ctl.c | 34
 src/include/executor/hashjoin.h | 5
 src/include/storage/barrier.h | 8
 src/include/storage/buf_internals.h | 20
 src/include/storage/lwlock.h | 2
 src/include/storage/proc.h | 14
 src/include/storage/shm_mq.h | 14
 src/include/storage/shmem.h | 1
 src/include/utils/builtins.h | 11
 src/include/utils/chash.h | 69
 src/include/utils/ruleutils.h | 34
 src/port/crypt.c | 2
 src/test/regress/expected/jsonb.out | 36
 src/test/regress/expected/jsonb_1.out | 36
 src/test/regress/expected/matview.out | 2
 src/test/regress/expected/polygon.out | 35
 src/test/regress/sql/jsonb.sql | 7
 src/test/regress/sql/polygon.sql | 35
 55 files changed, 2493 insertions(+), 530 deletions(-)
diff --git a/contrib/Makefile b/contrib/Makefile
index b37d0dd2c3..0b91ac10ee 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
earthdistance \
file_fdw \
fuzzystrmatch \
+ hashtest \
hstore \
intagg \
intarray \
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000000..3ee42f87d8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000000..e271baff0f
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
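(Usage note, not part of the patch: hashtest's _PG_init(), in hashtest.c below, sets up shared memory only when the library is loaded via shared_preload_libraries, so the module must be preloaded; the tests are then driven from SQL, e.g. CREATE EXTENSION hashtest; SELECT chash_insert_test();. The *_concurrent_test() functions are intended to be run from several sessions at once.)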
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000000..172a5bb156
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+Datum chash_insert_test(PG_FUNCTION_ARGS);
+Datum chash_search_test(PG_FUNCTION_ARGS);
+Datum chash_delete_test(PG_FUNCTION_ARGS);
+Datum chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum chash_collision_test(PG_FUNCTION_ARGS);
+Datum dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum dynahash_search_test(PG_FUNCTION_ARGS);
+Datum dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+ uint32 key;
+ uint32 val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+ "hashtest-chash", /* name */
+ 1048576, /* capacity */
+ sizeof(hentry), /* element size */
+ sizeof(uint32) /* key size */
+};
+
+#define DYNAHASH_PARTITIONS 16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+ (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+ Size cs;
+ Size ds;
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = hashtest_shmem_startup;
+ chash = CHashBootstrap(&cdesc);
+ cs = CHashEstimateSize(chash);
+ RequestAddinShmemSpace(cs);
+ ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+ RequestAddinShmemSpace(ds);
+ elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+ (unsigned) ds);
+ RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = hashtest_client_auth_hook;
+
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+ on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+ uint64 stats[CHS_NumberOfStatistics];
+ CHashStatisticsType i;
+ StringInfoData buf;
+
+ CHashStatistics(chash, stats);
+ initStringInfo(&buf);
+
+ for (i = 0; i < CHS_NumberOfStatistics; ++i)
+ {
+ if (stats[i] == 0)
+ continue;
+ appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+ CHashStatisticsNames[i]);
+ }
+
+ if (buf.len > 1)
+ {
+ buf.data[buf.len-2] = '\0';
+ elog(LOG, "chash statistics: %s", buf.data);
+ }
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+ HASHCTL info;
+ uint32 i;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Initialize concurrent hash table. */
+ chash = CHashInitialize(chash, &cdesc);
+
+ /* Initialize shared dynahash table. */
+ info.keysize = cdesc.key_size;
+ info.entrysize = cdesc.element_size;
+ info.hash = tag_hash;
+ info.num_partitions = DYNAHASH_PARTITIONS;
+
+ dynahash = ShmemInitHash("hashtest-dynahash",
+ cdesc.capacity, cdesc.capacity,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+ dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = i * 31;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != e.key * 31)
+ elog(LOG, "search %u: found %u", i, e.val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = CHashDelete(chash, &e);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = 0;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!CHashSearch(chash, &e))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!CHashDelete(chash, &e))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, e.val);
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+ bool found;
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_ENTER, &found);
+ if (!found)
+ e->val = val;
+ LWLockRelease(lockid);
+
+ return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_SHARED);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_FIND, NULL);
+ if (e)
+ *val = e->val;
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_REMOVE, NULL);
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, i * 31);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = dynahash_insert(i, i * 31);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+ uint32 val;
+
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != i * 31)
+ elog(LOG, "search %u: found %u", i, val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = dynahash_delete(i);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(seed | i, MyProcPid);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_search(seed | i, &val);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!dynahash_search(seed | i, &val))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(seed | i);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!dynahash_delete(seed | i))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, MyProcPid);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000000..b8e0f01346
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index bbfcab71ce..56db0dd654 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -14,12 +14,10 @@
#include "pg_upgrade.h"
-static void set_locale_and_encoding(ClusterInfo *cluster);
static void check_new_cluster_is_empty(void);
-static void check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl);
-static bool equivalent_locale(const char *loca, const char *locb);
-static bool equivalent_encoding(const char *chara, const char *charb);
+static void check_databases_are_compatible(void);
+static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb);
+static bool equivalent_locale(int category, const char *loca, const char *locb);
static void check_is_install_user(ClusterInfo *cluster);
static void check_for_prepared_transactions(ClusterInfo *cluster);
static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -81,8 +79,6 @@ check_and_dump_old_cluster(bool live_check)
if (!live_check)
start_postmaster(&old_cluster, true);
- set_locale_and_encoding(&old_cluster);
-
get_pg_database_relfilenode(&old_cluster);
/* Extract a list of databases and tables from the old cluster */
@@ -127,13 +123,10 @@ check_and_dump_old_cluster(bool live_check)
void
check_new_cluster(void)
{
- set_locale_and_encoding(&new_cluster);
-
- check_locale_and_encoding(&old_cluster.controldata, &new_cluster.controldata);
-
get_db_and_rel_infos(&new_cluster);
check_new_cluster_is_empty();
+ check_databases_are_compatible();
check_loadable_libraries();
@@ -279,93 +272,25 @@ check_cluster_compatibility(bool live_check)
/*
- * set_locale_and_encoding()
- *
- * query the database to get the template0 locale
- */
-static void
-set_locale_and_encoding(ClusterInfo *cluster)
-{
- ControlData *ctrl = &cluster->controldata;
- PGconn *conn;
- PGresult *res;
- int i_encoding;
- int cluster_version = cluster->major_version;
-
- conn = connectToServer(cluster, "template1");
-
- /* for pg < 80400, we got the values from pg_controldata */
- if (cluster_version >= 80400)
- {
- int i_datcollate;
- int i_datctype;
-
- res = executeQueryOrDie(conn,
- "SELECT datcollate, datctype "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_datcollate = PQfnumber(res, "datcollate");
- i_datctype = PQfnumber(res, "datctype");
-
- if (GET_MAJOR_VERSION(cluster->major_version) < 902)
- {
- /*
- * Pre-9.2 did not canonicalize the supplied locale names to match
- * what the system returns, while 9.2+ does, so convert pre-9.2 to
- * match.
- */
- ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE,
- pg_strdup(PQgetvalue(res, 0, i_datcollate)));
- ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE,
- pg_strdup(PQgetvalue(res, 0, i_datctype)));
- }
- else
- {
- ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
- ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
- }
-
- PQclear(res);
- }
-
- res = executeQueryOrDie(conn,
- "SELECT pg_catalog.pg_encoding_to_char(encoding) "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_encoding = PQfnumber(res, "pg_encoding_to_char");
- ctrl->encoding = pg_strdup(PQgetvalue(res, 0, i_encoding));
-
- PQclear(res);
-
- PQfinish(conn);
-}
-
-
-/*
* check_locale_and_encoding()
*
- * Check that old and new locale and encoding match. Even though the backend
- * tries to canonicalize stored locale names, the platform often doesn't
- * cooperate, so it's entirely possible that one DB thinks its locale is
- * "en_US.UTF-8" while the other says "en_US.utf8". Try to be forgiving.
+ * Check that locale and encoding of a database in the old and new clusters
+ * are compatible.
*/
static void
-check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl)
+check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb)
{
- if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate))
- pg_fatal("lc_collate cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_collate, newctrl->lc_collate);
- if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype))
- pg_fatal("lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_ctype, newctrl->lc_ctype);
- if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
- pg_fatal("encoding cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->encoding, newctrl->encoding);
+ if (olddb->db_encoding != newdb->db_encoding)
+ pg_fatal("encodings for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name,
+ pg_encoding_to_char(olddb->db_encoding),
+ pg_encoding_to_char(newdb->db_encoding));
+ if (!equivalent_locale(LC_COLLATE, olddb->db_collate, newdb->db_collate))
+ pg_fatal("lc_collate values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_collate, newdb->db_collate);
+ if (!equivalent_locale(LC_CTYPE, olddb->db_ctype, newdb->db_ctype))
+ pg_fatal("lc_ctype values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_ctype, newdb->db_ctype);
}
/*
@@ -373,61 +298,46 @@ check_locale_and_encoding(ControlData *oldctrl,
*
* Best effort locale-name comparison. Return false if we are not 100% sure
* the locales are equivalent.
+ *
+ * Note: The encoding parts of the names are ignored. This function is
+ * currently used to compare locale names stored in pg_database, and
+ * pg_database contains a separate encoding field. That's compared directly
+ * in check_locale_and_encoding().
*/
static bool
-equivalent_locale(const char *loca, const char *locb)
+equivalent_locale(int category, const char *loca, const char *locb)
{
- const char *chara = strrchr(loca, '.');
- const char *charb = strrchr(locb, '.');
- int lencmp;
-
- /* If they don't both contain an encoding part, just do strcasecmp(). */
- if (!chara || !charb)
- return (pg_strcasecmp(loca, locb) == 0);
+ const char *chara;
+ const char *charb;
+ char *canona;
+ char *canonb;
+ int lena;
+ int lenb;
/*
- * Compare the encoding parts. Windows tends to use code page numbers for
- * the encoding part, which equivalent_encoding() won't like, so accept if
- * the strings are case-insensitive equal; otherwise use
- * equivalent_encoding() to compare.
+ * If the names are equal, the locales are equivalent. Checking this
+ * first avoids calling setlocale() in the common case that the names
+ * are equal. That's a good thing if, for example, setlocale() is buggy.
*/
- if (pg_strcasecmp(chara + 1, charb + 1) != 0 &&
- !equivalent_encoding(chara + 1, charb + 1))
- return false;
+ if (pg_strcasecmp(loca, locb) == 0)
+ return true;
/*
- * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8).
- *
- * It's tempting to ignore non-alphanumeric chars here, but for now it's
- * not clear that that's necessary; just do case-insensitive comparison.
+ * Not identical. Canonicalize both names, remove the encoding parts,
+ * and try again.
*/
- lencmp = chara - loca;
- if (lencmp != charb - locb)
- return false;
+ canona = get_canonical_locale_name(category, loca);
+ chara = strrchr(canona, '.');
+ lena = chara ? (chara - canona) : strlen(canona);
- return (pg_strncasecmp(loca, locb, lencmp) == 0);
-}
+ canonb = get_canonical_locale_name(category, locb);
+ charb = strrchr(canonb, '.');
+ lenb = charb ? (charb - canonb) : strlen(canonb);
-/*
- * equivalent_encoding()
- *
- * Best effort encoding-name comparison. Return true only if the encodings
- * are valid server-side encodings and known equivalent.
- *
- * Because the lookup in pg_valid_server_encoding() does case folding and
- * ignores non-alphanumeric characters, this will recognize many popular
- * variant spellings as equivalent, eg "utf8" and "UTF-8" will match.
- */
-static bool
-equivalent_encoding(const char *chara, const char *charb)
-{
- int enca = pg_valid_server_encoding(chara);
- int encb = pg_valid_server_encoding(charb);
+ if (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0)
+ return true;
- if (enca < 0 || encb < 0)
- return false;
-
- return (enca == encb);
+ return false;
}
@@ -450,7 +360,35 @@ check_new_cluster_is_empty(void)
new_cluster.dbarr.dbs[dbnum].db_name);
}
}
+}
+
+/*
+ * Check that every database that already exists in the new cluster is
+ * compatible with the corresponding database in the old one.
+ */
+static void
+check_databases_are_compatible(void)
+{
+ int newdbnum;
+ int olddbnum;
+ DbInfo *newdbinfo;
+ DbInfo *olddbinfo;
+ for (newdbnum = 0; newdbnum < new_cluster.dbarr.ndbs; newdbnum++)
+ {
+ newdbinfo = &new_cluster.dbarr.dbs[newdbnum];
+
+ /* Find the corresponding database in the old cluster */
+ for (olddbnum = 0; olddbnum < old_cluster.dbarr.ndbs; olddbnum++)
+ {
+ olddbinfo = &old_cluster.dbarr.dbs[olddbnum];
+ if (strcmp(newdbinfo->db_name, olddbinfo->db_name) == 0)
+ {
+ check_locale_and_encoding(olddbinfo, newdbinfo);
+ break;
+ }
+ }
+ }
}
@@ -470,7 +408,8 @@ create_script_for_cluster_analyze(char **analyze_script_file_name)
if (os_info.user_specified)
user_specification = psprintf("-U \"%s\" ", os_info.user);
- *analyze_script_file_name = psprintf("analyze_new_cluster.%s", SCRIPT_EXT);
+ *analyze_script_file_name = psprintf("%sanalyze_new_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
if ((script = fopen_priv(*analyze_script_file_name, "w")) == NULL)
pg_fatal("Could not open file \"%s\": %s\n",
@@ -551,7 +490,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
int tblnum;
char old_cluster_pgdata[MAXPGPATH];
- *deletion_script_file_name = psprintf("delete_old_cluster.%s", SCRIPT_EXT);
+ *deletion_script_file_name = psprintf("%sdelete_old_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
/*
* Some users (oddly) create tablespaces inside the cluster data
diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c
index 8379ebd71b..4e9d5948fa 100644
--- a/contrib/pg_upgrade/controldata.c
+++ b/contrib/pg_upgrade/controldata.c
@@ -122,10 +122,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
pg_fatal("Could not get control data using %s: %s\n",
cmd, getErrorText(errno));
- /* Only pre-8.4 has these so if they are not set below we will check later */
- cluster->controldata.lc_collate = NULL;
- cluster->controldata.lc_ctype = NULL;
-
/* Only in <= 9.2 */
if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
{
@@ -404,36 +400,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
cluster->controldata.data_checksum_version = str2uint(p);
got_data_checksum_version = true;
}
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_collate = pg_strdup(p);
- }
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_CTYPE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_ctype = pg_strdup(p);
- }
}
if (output)
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
index a1773aa8e5..c347dfc493 100644
--- a/contrib/pg_upgrade/info.c
+++ b/contrib/pg_upgrade/info.c
@@ -239,11 +239,15 @@ get_db_infos(ClusterInfo *cluster)
DbInfo *dbinfos;
int i_datname,
i_oid,
+ i_encoding,
+ i_datcollate,
+ i_datctype,
i_spclocation;
char query[QUERY_ALLOC];
snprintf(query, sizeof(query),
- "SELECT d.oid, d.datname, %s "
+ "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "
+ "%s AS spclocation "
"FROM pg_catalog.pg_database d "
" LEFT OUTER JOIN pg_catalog.pg_tablespace t "
" ON d.dattablespace = t.oid "
@@ -252,12 +256,15 @@ get_db_infos(ClusterInfo *cluster)
"ORDER BY 2",
/* 9.2 removed the spclocation column */
(GET_MAJOR_VERSION(cluster->major_version) <= 901) ?
- "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid) AS spclocation");
+ "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid)");
res = executeQueryOrDie(conn, "%s", query);
i_oid = PQfnumber(res, "oid");
i_datname = PQfnumber(res, "datname");
+ i_encoding = PQfnumber(res, "encoding");
+ i_datcollate = PQfnumber(res, "datcollate");
+ i_datctype = PQfnumber(res, "datctype");
i_spclocation = PQfnumber(res, "spclocation");
ntups = PQntuples(res);
@@ -267,6 +274,9 @@ get_db_infos(ClusterInfo *cluster)
{
dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid));
dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname));
+ dbinfos[tupnum].db_encoding = atoi(PQgetvalue(res, tupnum, i_encoding));
+ dbinfos[tupnum].db_collate = pg_strdup(PQgetvalue(res, tupnum, i_datcollate));
+ dbinfos[tupnum].db_ctype = pg_strdup(PQgetvalue(res, tupnum, i_datctype));
snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s",
PQgetvalue(res, tupnum, i_spclocation));
}
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 56a7505a96..c3b81e4a08 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -76,6 +76,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '/'
#define RM_CMD "rm -f"
#define RMDIR_CMD "rm -rf"
+#define SCRIPT_PREFIX "./"
#define SCRIPT_EXT "sh"
#define ECHO_QUOTE "'"
#define ECHO_BLANK ""
@@ -86,6 +87,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '\\'
#define RM_CMD "DEL /q"
#define RMDIR_CMD "RMDIR /s/q"
+#define SCRIPT_PREFIX ""
#define SCRIPT_EXT "bat"
#define EXE_EXT ".exe"
#define ECHO_QUOTE ""
@@ -180,6 +182,9 @@ typedef struct
char *db_name; /* database name */
char db_tablespace[MAXPGPATH]; /* database default tablespace
* path */
+ char *db_collate;
+ char *db_ctype;
+ int db_encoding;
RelInfoArr rel_arr; /* array of all user relinfos */
} DbInfo;
@@ -218,9 +223,6 @@ typedef struct
bool date_is_int;
bool float8_pass_by_value;
bool data_checksum_version;
- char *lc_collate;
- char *lc_ctype;
- char *encoding;
} ControlData;
/*
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index f4617b67e9..f98e282741 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1227,8 +1227,7 @@
<entry><type>bool</type></entry>
<entry></entry>
<entry>
- This represents a not-null constraint. It is possible to
- change this column to enable or disable the constraint.
+ This represents a not-null constraint.
</entry>
</row>
diff --git a/doc/src/sgml/json.sgml b/doc/src/sgml/json.sgml
index 37dd611aeb..8feb2fbf0a 100644
--- a/doc/src/sgml/json.sgml
+++ b/doc/src/sgml/json.sgml
@@ -269,6 +269,12 @@ SELECT '"foo"'::jsonb @> '"foo"'::jsonb;
-- The array on the right side is contained within the one on the left:
SELECT '[1, 2, 3]'::jsonb @> '[1, 3]'::jsonb;
+-- Order of array elements is not significant, so this is also true:
+SELECT '[1, 2, 3]'::jsonb @> '[3, 1]'::jsonb;
+
+-- Duplicate array elements don't matter either:
+SELECT '[1, 2, 3]'::jsonb @> '[1, 2, 2]'::jsonb;
+
-- The object with a single pair on the right side is contained
-- within the object on the left side:
SELECT '{"product": "PostgreSQL", "version": 9.4, "jsonb":true}'::jsonb @> '{"version":9.4}'::jsonb;
@@ -288,8 +294,10 @@ SELECT '{"foo": {"bar": "baz"}}'::jsonb @> '{"bar": "baz"}'::jsonb; -- yields f
The general principle is that the contained object must match the
containing object as to structure and data contents, possibly after
discarding some non-matching array elements or object key/value pairs
- from the containing object. However, the order of array elements is
- not significant when doing a containment match.
+ from the containing object.
+ But remember that the order of array elements is not significant when
+ doing a containment match, and duplicate array elements are effectively
+ considered only once.
</para>
<para>
diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index f008e937ee..f195495520 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -487,8 +487,8 @@ $$ LANGUAGE plpgsql;
CREATE FUNCTION extended_sales(p_itemno int)
RETURNS TABLE(quantity int, total numeric) AS $$
BEGIN
- RETURN QUERY SELECT quantity, quantity * price FROM sales
- WHERE itemno = p_itemno;
+ RETURN QUERY SELECT s.quantity, s.quantity * s.price FROM sales AS s
+ WHERE s.itemno = p_itemno;
END;
$$ LANGUAGE plpgsql;
</programlisting>
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 2af9413f21..e76b22fb2d 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -302,7 +302,7 @@ PROVE_FLAGS = --verbose
# prepend to path if already set, else just set it
define add_to_path
-$(1)='$(if $($(1)),$(2):$$$(1),$(2))'
+$(1)="$(if $($(1)),$(2):$$$(1),$(2))"
endef
# platform-specific environment variable to set shared library path
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 850008b340..8849c08e54 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -28,6 +28,7 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index bab5a49187..b71f65de2c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1186,7 +1186,7 @@ _bt_pagedel(Relation rel, Buffer buf)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
- errhint("This can be caused by an interrupt VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+ errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
_bt_relbuf(rel, buf);
return ndeleted;
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 5a4dbb9c53..235b442296 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5193,8 +5193,8 @@ readRecoveryCommandFile(void)
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid recovery_target parameter"),
- errhint("The only allowed value is 'immediate'")));
+ errmsg("invalid value for recovery parameter \"recovery_target\""),
+ errhint("The only allowed value is \"immediate\".")));
ereport(DEBUG2,
(errmsg_internal("recovery_target = '%s'",
item->value)));
@@ -5257,7 +5257,7 @@ readRecoveryCommandFile(void)
"recovery_min_apply_delay"),
hintmsg ? errhint("%s", _(hintmsg)) : 0));
ereport(DEBUG2,
- (errmsg("recovery_min_apply_delay = '%s'", item->value)));
+ (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
}
else
ereport(FATAL,
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 55c1e79563..c0eade0a3d 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -69,6 +69,7 @@
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 7831e900ba..b52e6b9bc0 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -839,7 +839,7 @@ dropdb(const char *dbname, bool missing_ok)
if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active))
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
- errmsg("database \"%s\" is used by a logical decoding slot",
+ errmsg("database \"%s\" is used by a logical replication slot",
dbname),
errdetail_plural("There is %d slot, %d of them active.",
"There are %d slots, %d of them active.",
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736115..387d263e87 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -28,6 +28,7 @@
#include "utils/json.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tuplesort.h"
#include "utils/xml.h"
@@ -1900,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+ ExplainPropertyLong("Original Hash Buckets",
+ hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch)
+ else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+ (hashtable->nbuckets_original != hashtable->nbuckets))
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
- hashtable->nbuckets, hashtable->nbatch,
- hashtable->nbatch_original, spacePeakKb);
+ "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ hashtable->nbuckets, hashtable->nbuckets_original,
+ hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
}
else
{
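Illustration, not part of the patch: with this change, the text-format EXPLAIN ANALYZE line for a hash whose bucket count grew at runtime reports both the final and original counts, roughly like this (numbers hypothetical):

  Buckets: 2048 (originally 1024) Batches: 1 (originally 1) Memory Usage: 565kB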
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index d1c8bb0d53..30bd40db18 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -597,7 +597,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner,
{
ereport(ERROR,
(errcode(ERRCODE_CARDINALITY_VIOLATION),
- errmsg("new data for \"%s\" contains duplicate rows without any NULL columns",
+ errmsg("new data for \"%s\" contains duplicate rows without any null columns",
RelationGetRelationName(matviewRel)),
errdetail("Row: %s",
SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1))));
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index cb16c53a60..ecdff1e5e3 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -85,6 +85,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
@@ -9045,7 +9046,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation,
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
}
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index ad364efbcb..55a68810f2 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -72,6 +72,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c
index 9d0039c42a..184bcd0582 100644
--- a/src/backend/commands/view.c
+++ b/src/backend/commands/view.c
@@ -471,7 +471,7 @@ DefineView(ViewStmt *stmt, const char *queryString)
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index b428c18b5c..7c5bb77b0c 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
/* It's a skew tuple, so put it into that hash table */
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
bucketNumber);
+ hashtable->skewTuples += 1;
}
else
{
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
}
}
+ /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+ if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+ {
+ /* We never decrease the number of buckets. */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets %d => %d\n",
+ hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+ ExecHashIncreaseNumBuckets(hashtable);
+ }
+
+ /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+ hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
+ hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
+ hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
+ /* If we know we need to resize nbuckets, we can do it while rebatching. */
+ if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+ {
+ /* we never decrease the number of buckets */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+ hashtable->buckets = repalloc(hashtable->buckets,
+ sizeof(HashJoinTuple) * hashtable->nbuckets);
+ }
+
/*
* We will scan through the chunks directly, so that we can reset the
* buckets now and not have to keep track which tuples in the buckets have
@@ -704,6 +742,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
+ * ExecHashIncreaseNumBuckets
+ * increase the original number of buckets in order to reduce
+ * number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ HashMemoryChunk chunk;
+
+ /* do nothing if not an increase (it's called increase for a reason) */
+ if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+ return;
+
+ /*
+ * We already know the optimal number of buckets, so let's just
+ * compute the log2_nbuckets for it.
+ */
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets <= (INT_MAX / 2));
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+ /*
+ * Just reallocate the proper number of buckets - we don't need to
+ * walk through them - we can walk the dense-allocated chunks
+ * (just like in ExecHashIncreaseNumBatches, but without all the
+ * copying into new chunks)
+ */
+ hashtable->buckets =
+ (HashJoinTuple *) repalloc(hashtable->buckets,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+ /* scan through all tuples in all chunks to rebuild the hash table */
+ for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
+ {
+ /* process all tuples stored in this chunk */
+ size_t idx = 0;
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* add the tuple to the proper bucket */
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+
+ /* advance index past the tuple */
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+ }
+
+#ifdef HJDEBUG
+ printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ hashtable->nbuckets, (hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbuckets);
+#endif
+}
+
+
+/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
*/
HashJoinTuple hashTuple;
int hashTupleSize;
+ double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
/* Create the HashJoinTuple */
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashTuple->next = hashtable->buckets[bucketno];
hashtable->buckets[bucketno] = hashTuple;
+ /*
+ * Increase the (optimal) number of buckets if we just exceeded the
+ * NTUP_PER_BUCKET threshold, but only when there's still a single batch.
+ */
+ if ((hashtable->nbatch == 1) &&
+ (hashtable->nbuckets_optimal <= INT_MAX/2) && /* overflow protection */
+ (ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+ {
+ hashtable->nbuckets_optimal *= 2;
+ hashtable->log2_nbuckets_optimal += 1;
+ }
+
/* Account for space used, and back off if we've used too much */
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
- if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+ if (hashtable->spaceUsed +
+ hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
}
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
* functions are good about randomizing all their output bits, else we are
* likely to have very skewed bucket or batch occupancy.)
*
- * nbuckets doesn't change over the course of the join.
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth. Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
*
* nbatch is always a power of 2; we increase it only by doubling it. This
* effectively adds one more bit to the top of the batchno.
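The comment above describes ExecHashGetBucketAndBatch(), whose body is not included in this hunk. A minimal sketch of the scheme it implies (hypothetical helper, not patch code), assuming nbuckets and nbatch are both powers of two: the low log2_nbuckets bits of the hash value select the bucket, and the bits just above them select the batch, which is why nbuckets must stop changing once nbatch > 1.

	static inline void
	get_bucket_and_batch(uint32 hashvalue,
						 int nbuckets, int log2_nbuckets, int nbatch,
						 int *bucketno, int *batchno)
	{
		/* powers of two, so masking and shifting replace mod and div */
		*bucketno = hashvalue & (nbuckets - 1);
		if (nbatch > 1)
			*batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1);
		else
			*batchno = 0;		/* single batch: growing nbuckets is safe */
	}

While nbatch == 1, doubling nbuckets only redistributes tuples among buckets; no tuple can change batch, so the growth is purely an in-memory rearrangement.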
diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c
index 8d8f12952a..b05364ced0 100644
--- a/src/backend/libpq/be-secure-openssl.c
+++ b/src/backend/libpq/be-secure-openssl.c
@@ -614,7 +614,7 @@ be_tls_write(Port *port, void *ptr, size_t len)
if (retries >= 20)
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg("unable to complete SSL handshake")));
+ errmsg("could not complete SSL handshake on renegotiation, too many failures")));
}
}
}
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 7a38f2f150..092cf8fe43 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,8 +21,10 @@
*/
#include "postgres.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
+#include "utils/chash.h"
/* entry for buffer lookup hashtable */
@@ -32,8 +34,13 @@ typedef struct
int id; /* Associated buffer ID */
} BufferLookupEnt;
-static HTAB *SharedBufHash;
-
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/*
* Estimate space needed for mapping hashtable
@@ -42,7 +49,13 @@ static HTAB *SharedBufHash;
Size
BufTableShmemSize(int size)
{
- return hash_estimate_size(size, sizeof(BufferLookupEnt));
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
}
/*
@@ -52,59 +65,29 @@ BufTableShmemSize(int size)
void
InitBufTable(int size)
{
- HASHCTL info;
-
- /* assume no locking is needed yet */
-
- /* BufferTag maps to Buffer */
- info.keysize = sizeof(BufferTag);
- info.entrysize = sizeof(BufferLookupEnt);
- info.hash = tag_hash;
- info.num_partitions = NUM_BUFFER_PARTITIONS;
-
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
- size, size,
- &info,
- HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- * Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag. We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
- return get_hash_value(SharedBufHash, (void *) tagPtr);
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
}
/*
* BufTableLookup
* Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
*/
int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+BufTableLookup(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_FIND,
- NULL);
+ BufferLookupEnt ent;
- if (!result)
+ ent.key = *tagPtr;
+ if (!CHashSearch(SharedBufHash, &ent))
return -1;
- return result->id;
+ return ent.id;
}
/*
@@ -118,27 +101,20 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+BufTableInsert(BufferTag *tagPtr, int buf_id)
{
- BufferLookupEnt *result;
- bool found;
+ BufferLookupEnt ent;
+
+ ent.key = *tagPtr;
+ ent.id = buf_id;
Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_ENTER,
- &found);
-
- if (found) /* found something already in the table */
- return result->id;
-
- result->id = buf_id;
+ if (CHashInsert(SharedBufHash, &ent))
+ return -1;
- return -1;
+ return ent.id;
}
/*
@@ -148,17 +124,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+BufTableDelete(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_REMOVE,
- NULL);
-
- if (!result) /* shouldn't happen */
+ if (!CHashDelete(SharedBufHash, tagPtr))
elog(ERROR, "shared buffer hash table corrupted");
}
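To summarize the caller-side protocol change (hypothetical callers distilled from the bufmgr.c hunks below, not literal patch code): lookups no longer take a buffer mapping partition lock, and responsibility for consistency moves to rechecking the buffer tag after pinning.

	/* before: every lookup bracketed by a mapping partition lock */
	static int
	lookup_before(BufferTag tag)
	{
		uint32	hash = BufTableHashCode(&tag);
		LWLock *lock = BufMappingPartitionLock(hash);
		int		buf_id;

		LWLockAcquire(lock, LW_SHARED);
		buf_id = BufTableLookup(&tag, hash);
		LWLockRelease(lock);
		return buf_id;
	}

	/* after: CHash synchronizes internally, so a bare lookup suffices;
	 * a caller that then pins the buffer must recheck its tag and retry,
	 * as BufferAlloc() below does, since the mapping can change before
	 * the pin lands */
	static int
	lookup_after(BufferTag tag)
	{
		return BufTableLookup(&tag);
	}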
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 45d1d61d95..437deb905c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -429,22 +429,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
else
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
int buf_id;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
+ buf_id = BufTableLookup(&newTag);
/* If not in buffers, initiate prefetch */
if (buf_id < 0)
@@ -822,11 +814,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
bool *foundPtr)
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
BufferTag oldTag; /* previous identity of selected buffer */
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
int buf_id;
volatile BufferDesc *buf;
@@ -835,29 +823,31 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
+start:
+ buf_id = BufTableLookup(&newTag);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
* Found it. Now, pin the buffer so no one can steal it from the
- * buffer pool, and check to see if the correct data has been loaded
- * into the buffer.
+ * buffer pool.
*/
- buf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[buf_id];
- valid = PinBuffer(buf, strategy);
+ valid = PinBuffer(foundbuf, strategy);
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
+ /* Check whether someone recycled the buffer before we pinned it. */
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto start;
+ }
*foundPtr = TRUE;
+ /* Check to see if the correct data has been loaded into the buffer. */
if (!valid)
{
/*
@@ -867,7 +857,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer must
@@ -877,15 +867,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. Remember to unlock the mapping lock while doing the work.
- */
- LWLockRelease(newPartitionLock);
-
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -986,42 +970,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
if (oldFlags & BM_TAG_VALID)
{
- /*
- * Need to compute the old tag's hashcode and partition lock ID.
- * XXX is it worth storing the hashcode in BufferDesc so we need
- * not recompute it here? Probably not.
- */
+ /* Save old tag. */
oldTag = buf->tag;
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
- /*
- * Must lock the lower-numbered partition first to avoid
- * deadlocks.
- */
- if (oldPartitionLock < newPartitionLock)
- {
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- else if (oldPartitionLock > newPartitionLock)
- {
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- }
- else
- {
- /* only one partition, only one lock */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- }
- else
- {
- /* if it wasn't valid, we need only the new partition */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- /* these just keep the compiler quiet about uninit variables */
- oldHash = 0;
- oldPartitionLock = 0;
}
/*
@@ -1031,32 +981,34 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
- buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+enter:
+ buf_id = BufTableInsert(&newTag, buf->buf_id);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
- * Got a collision. Someone has already done what we were about to
- * do. We'll just handle this as if it were found in the buffer
- * pool in the first place. First, give up the buffer we were
- * planning to use.
+ * We've got a collision, apparently: it looks like someone else
+ * did what we were about to do. We can handle this as if we had
+ * found the buffer in the pool in the first place, but we must
+ * recheck the buffer tag after pinning it, because it could still
+ * get renamed under us.
+ */
+ foundbuf = &BufferDescriptors[buf_id];
+ valid = PinBuffer(foundbuf, strategy);
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto enter;
+ }
+
+ /*
+ * Collision confirmed. Give up the buffer we were planning to
+ * use.
*/
UnpinBuffer(buf, true);
- /* Can give up that buffer's mapping partition lock now */
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
-
- /* remaining code should match code at top of routine */
-
- buf = &BufferDescriptors[buf_id];
-
- valid = PinBuffer(buf, strategy);
-
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
-
*foundPtr = TRUE;
if (!valid)
@@ -1068,7 +1020,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer
@@ -1078,7 +1030,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
/*
@@ -1097,11 +1049,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag, newHash);
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&newTag);
UnpinBuffer(buf, true);
}
@@ -1124,13 +1072,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf);
if (oldFlags & BM_TAG_VALID)
- {
- BufTableDelete(&oldTag, oldHash);
- if (oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- }
-
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1166,42 +1108,11 @@ static void
InvalidateBuffer(volatile BufferDesc *buf)
{
BufferTag oldTag;
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
- UnlockBufHdr(buf);
-
- /*
- * Need to compute the old tag's hashcode and partition lock ID. XXX is it
- * worth storing the hashcode in BufferDesc so we need not recompute it
- * here? Probably not.
- */
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
- /*
- * Acquire exclusive mapping lock in preparation for changing the buffer's
- * association.
- */
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
- /* Re-lock the buffer header */
- LockBufHdr(buf);
-
- /* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
- {
- UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
- return;
- }
-
/*
* We assume the only reason for it to be pinned is that someone else is
* flushing the page out. Wait for them to finish. (This could be an
@@ -1211,15 +1122,21 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ while (buf->refcount != 0)
{
UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (GetPrivateRefCount(buf->buf_id) > 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
- goto retry;
+ LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ {
+ UnlockBufHdr(buf);
+ return;
+ }
}
/*
@@ -1237,12 +1154,7 @@ retry:
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
-
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Insert the buffer at the head of the list of free buffers.
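
For illustration, a minimal standalone sketch (not part of the patch) of the pin-then-recheck pattern the new BufferAlloc code relies on: look the tag up without any mapping partition lock, pin the buffer, and only then revalidate the tag, retrying if the buffer was recycled in between. All names here are invented, and a sequentially consistent fetch-and-add stands in for PinBuffer's barrier semantics.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <string.h>

    typedef struct { unsigned rnode, forknum, blocknum; } Tag;

    typedef struct
    {
        Tag        tag;     /* identity of the cached block */
        atomic_int pins;    /* nonzero prevents recycling */
        bool       valid;
    } Desc;

    #define NDESC 16
    static Desc descriptors[NDESC];

    /* Stand-in for the lock-free mapping table: tag -> slot, or -1. */
    static int lookup(const Tag *tag)
    {
        for (int i = 0; i < NDESC; i++)
            if (descriptors[i].valid &&
                memcmp(&descriptors[i].tag, tag, sizeof(Tag)) == 0)
                return i;
        return -1;
    }

    /* Return a pinned descriptor for 'tag', or NULL if it is not cached. */
    static Desc *pin_buffer(const Tag *tag)
    {
        for (;;)                            /* plays the role of 'goto start' */
        {
            int slot = lookup(tag);

            if (slot < 0)
                return NULL;

            Desc *d = &descriptors[slot];

            atomic_fetch_add(&d->pins, 1);  /* pin first ... */
            if (memcmp(&d->tag, tag, sizeof(Tag)) == 0)
                return d;                   /* ... then revalidate the tag */

            /* Recycled between lookup and pin: unpin and retry. */
            atomic_fetch_sub(&d->pins, 1);
        }
    }
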
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index d96627a774..90df5930e1 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -139,7 +139,7 @@ struct shm_mq_handle
};
static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes,
- void *data, bool nowait, Size *bytes_written);
+ const void *data, bool nowait, Size *bytes_written);
static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed,
bool nowait, Size *nbytesp, void **datap);
static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
@@ -301,7 +301,33 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
}
/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
* Write a message into a shared message queue.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait)
+{
+ shm_mq_iovec iov;
+
+ iov.data = data;
+ iov.len = nbytes;
+
+ return shm_mq_sendv(mqh, &iov, 1, nowait);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
*
* When nowait = false, we'll wait on our process latch when the ring buffer
* fills up, and then continue writing once the receiver has drained some data.
@@ -315,14 +341,22 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
* the length or payload will corrupt the queue.)
*/
shm_mq_result
-shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
{
shm_mq_result res;
shm_mq *mq = mqh->mqh_queue;
+ Size nbytes = 0;
Size bytes_written;
+ int i;
+ int which_iov = 0;
+ Size offset;
Assert(mq->mq_sender == MyProc);
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
/* Try to write, or finish writing, the length word into the buffer. */
while (!mqh->mqh_length_word_complete)
{
@@ -348,18 +382,80 @@ shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
/* Write the actual data bytes into the buffer. */
Assert(mqh->mqh_partial_bytes <= nbytes);
- res = shm_mq_send_bytes(mqh, nbytes - mqh->mqh_partial_bytes,
- ((char *) data) + mqh->mqh_partial_bytes,
- nowait, &bytes_written);
- if (res == SHM_MQ_WOULD_BLOCK)
- mqh->mqh_partial_bytes += bytes_written;
- else
+ offset = mqh->mqh_partial_bytes;
+ do
{
- mqh->mqh_partial_bytes = 0;
- mqh->mqh_length_word_complete = false;
- }
- if (res != SHM_MQ_SUCCESS)
- return res;
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd,
+ * except the last. Thus, if a chunk other than the last one ends
+ * on a non-MAXALIGN'd boundary, we have to combine the tail end of
+ * its data with data from one or more following chunks until we
+ * either reach the last chunk or accumulate a number of bytes which
+ * is MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0;
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
/* Notify receiver of the newly-written data, and return. */
return shm_mq_notify_receiver(mq);
@@ -653,8 +749,8 @@ shm_mq_detach(shm_mq *mq)
* Write bytes into a shared message queue.
*/
static shm_mq_result
-shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait,
- Size *bytes_written)
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
{
shm_mq *mq = mqh->mqh_queue;
Size sent = 0;
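
As a hedged usage sketch of the gathered-write API added here (assuming an already-attached shm_mq_handle; error handling elided), a caller can send one logical message assembled from a header byte plus a separate payload buffer, letting shm_mq_sendv do the MAXALIGN'd chunk-combining described in the comments above:

    #include "postgres.h"
    #include "storage/shm_mq.h"

    static shm_mq_result
    send_tagged_message(shm_mq_handle *mqh, char msgtype,
                        const char *payload, Size payload_len)
    {
        shm_mq_iovec iov[2];

        iov[0].data = &msgtype;     /* one-byte header chunk */
        iov[0].len = 1;
        iov[1].data = payload;      /* body chunk, any length */
        iov[1].len = payload_len;

        /* The receiver sees a single message of 1 + payload_len bytes. */
        return shm_mq_sendv(mqh, iov, 2, false);
    }
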
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 2ea2216a65..38614a449d 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
return structPtr;
}
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+ ShmemIndexEnt *result;
+ void *ptr;
+ bool found;
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, &found);
+ if (!found || result == NULL)
+ elog(ERROR, "shared memory structure %s not found", name);
+ ptr = result->location;
+ Assert(ptr != NULL);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return ptr;
+}
/*
* Add two Size values, checking for overflow
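
A hedged sketch of the intended pairing, modeled on the CHashInitialize code later in this patch: the creating process sizes and initializes the structure with ShmemInitStruct, while a process that merely needs to reattach (the EXEC_BACKEND case) finds it by name with ShmemAttachStruct. The structure and its name are invented for illustration.

    #include "postgres.h"
    #include "miscadmin.h"
    #include "storage/shmem.h"

    typedef struct { int nslots; } MyModuleState;

    static MyModuleState *
    my_module_state(void)
    {
        if (!IsUnderPostmaster)
        {
            bool found;

            return (MyModuleState *) ShmemInitStruct("my module state",
                                                     sizeof(MyModuleState),
                                                     &found);
        }

        /* Under the postmaster: the structure must already exist. */
        return (MyModuleState *) ShmemAttachStruct("my module state");
    }
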
diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c
index 2d071b2523..d9aaac9ac2 100644
--- a/src/backend/utils/adt/jsonb_op.c
+++ b/src/backend/utils/adt/jsonb_op.c
@@ -57,7 +57,7 @@ jsonb_exists_any(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -90,7 +90,7 @@ jsonb_exists_all(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -117,8 +117,7 @@ jsonb_contains(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
@@ -137,8 +136,7 @@ jsonb_contained(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index f157df3532..2ff85396d0 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -957,13 +957,24 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
}
else if (rcont == WJB_BEGIN_OBJECT)
{
- JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
-
+ Assert(vval.type == jbvObject);
Assert(vcontained.type == jbvObject);
+ /*
+ * If the lhs has fewer pairs than the rhs, it can't possibly contain
+ * the rhs. (This conclusion is safe only because we de-duplicate
+ * keys in all Jsonb objects; thus there can be no corresponding
+ * optimization in the array case: for instance, the array ["x"]
+ * contains ["x", "x"], because duplicate elements on the rhs may all
+ * match a single lhs element.) The case probably won't arise often,
+ * but since it's such a cheap check we may as well make it.
+ */
+ if (vval.val.object.nPairs < vcontained.val.object.nPairs)
+ return false;
+
/* Work through rhs "is it contained within?" object */
for (;;)
{
+ JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
+
rcont = JsonbIteratorNext(mContained, &vcontained, false);
/*
@@ -1047,6 +1058,7 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
JsonbValue *lhsConts = NULL;
uint32 nLhsElems = vval.val.array.nElems;
+ Assert(vval.type == jbvArray);
Assert(vcontained.type == jbvArray);
/*
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 4eeb6314fa..67539ecde9 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -35,6 +35,7 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 6e41cbd142..24ade6cc20 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -55,6 +55,7 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c856..5d5338266d 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000000..0d4dc78a4e
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ * concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic. An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As usual, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS). Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap. When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS. Once this
+ * step is completed, the node is deleted. As a cleanup operation, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node. Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node. Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed. This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists. There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa. Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't. For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system. For this code, that
+ * is true except when garbage collection occurs. In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time. The algorithm
+ * could be adapted so as to be completely lock-free, essentially by
+ * guaranteeing that even in the worst case no combination of processes can
+ * obstruct garbage collection to such a degree as to prevent an inserter
+ * from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity. However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr ((uint32) -2)
+#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x) ((x) & 1)
+#define CHashPtrGetOffset(x) ((x) >> 1)
+#define CHashPtrMark(x) ((x) | 1)
+#define CHashPtrUnmark(x) ((x) & ~1)
+#define MakeCHashPtr(x) ((x) << 1)
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
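+
+/*
+ * For illustration: arena offset 5 is stored as MakeCHashPtr(5) == 10;
+ * CHashPtrMark(10) == 11 sets the low-order mark bit, CHashPtrUnmark(11)
+ * recovers 10, and CHashPtrGetOffset() of either value yields 5 again.
+ * InvalidCHashPtr is ((uint32) -2) so that both it and its delete-marked
+ * form, ((uint32) -1), satisfy CHashPtrIsInvalid().
+ */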
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item. Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+ CHashPtr next; /* arena offset of next element */
+ union
+ {
+ uint32 hashcode; /* hash(key) */
+ CHashPtr gcnext; /* arena offset of next garbage item */
+ } un;
+} CHashNode;
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table. We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+ /*
+ * These fields do not change after initialization.
+ */
+ CHashDescriptor desc; /* descriptor for this hash table */
+ uint32 bucket_mask; /* # of buckets, minus one */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
+ char *arena; /* arena */
+
+ /*
+ * These fields will be different in each backend; the shared copy is
+ * irrelevant.
+ */
+ int gc_pid; /* PID that set gc_next */
+ uint32 gc_next; /* next garbage list to reclaim */
+ uint64 stats[CHS_NumberOfStatistics]; /* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+ (AssertMacro((offset) < (table)->arena_limit), \
+ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+ (AssertMacro(!CHashPtrIsInvalid(ptr)), \
+ CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+ ((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+ CHashPtr target;
+ CHashPtr next;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node;
+ bool found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+ "searches",
+ "searches failed",
+ "inserts",
+ "inserts failed",
+ "inserts retried",
+ "deletions",
+ "deletions failed",
+ "deletions retried",
+ "scan expunges",
+ "scan expunges failed",
+ "scans restarted",
+ "cleanup scans",
+ "allocations failed",
+ "garbage enqueues retried",
+ "garbage dequeues failed",
+ "garbage collections",
+ "garbage collection spins",
+ "garbage collection reclaims skipped",
+ "garbage collection fast reclaims",
+ "garbage collection reclaims retried",
+ "<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+ CHashTable table;
+ uint32 bucket_shift;
+
+ /* Sanity check. */
+ Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+ elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
+
+ /*
+ * The number of buckets must be a power of two. To avoid (as much as
+ * possible) having to traverse long bucket chains, we aim for a load
+ * factor <= 1.0, so this is a pretty simple calculation: we just find the
+ * smallest power of two greater than or equal to the target capacity.
+ */
+ bucket_shift = fls(desc->capacity - 1);
+ table->bucket_mask = (1 << bucket_shift) - 1;
+
+ /*
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
+ */
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->gc_next = 0;
+ table->gc_pid = 0;
+
+ /*
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ /* Zero out statistics. */
+ memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+ return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+ Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
+
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
+
+ size = MAXALIGN(sizeof(CHashTableData));
+ size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+ return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+ uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashPtr *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
+
+ /* Arena follows the various lists. */
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
+
+ /* Initialize all three sets of lists to empty. */
+ for (i = 0; i < nbuckets; ++i)
+ table->bucket[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
+
+ /* Put all arena elements on the free lists. */
+ for (i = 0; i < table->arena_limit; ++i)
+ {
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
+ CHashNode *n = CHashTableGetRaw(table, i);
+
+ n->un.gcnext = *f;
+ *f = MakeCHashPtr(i);
+ }
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
+
+/*
+ * Search a concurrent hash table. entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in. If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true. If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket and return data from any matching entry. */
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ CHashTableIncrementStatistic(table, CHS_Search);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Search_Failed);
+ return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table. entry should be the complete entry
+ * to be inserted. If no duplicate entry is found in the table, this function
+ * will insert the entry and return true. Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity. The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashPtr new;
+ CHashNode *nnew;
+ CHashScanResult scan;
+
+ /*
+ * Allocate and initialize a new entry, on the assumption that the insert
+ * will succeed. If it ends up failing, we must be sure to put this back
+ * on some free list, lest it be permanently leaked.
+ */
+ new = CHashAllocate(table);
+ nnew = CHashTableGetNode(table, new);
+ nnew->un.hashcode = hashcode;
+ memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+ /* Prevent garbage collection for this bucket. */
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /*
+ * Scan the bucket. If we don't find a match, use compare-and-swap to
+ * insert the new node at the insert position. If we do find a match,
+ * return the data to the caller.
+ */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+ else
+ {
+ /*
+ * We didn't find a matching element, so use compare-and-swap to
+ * attempt to insert the new element we've prepared. If this fails,
+ * someone concurrently inserted or deleted an element. The correct
+ * insertion point might have changed, or the key we're trying to
+ * insert might now be present when it wasn't before, so we'll have
+ * to search the bucket chain anew.
+ *
+ * There is a risk of starvation here, but we hope it will not happen
+ * in practice. Contention for inserting new elements should be
+ * spread out pretty much evenly across N+M possible insertion points,
+ * where N is the number of buckets and M is the number of elements
+ * in the table. Even for a quite modestly sized table this is likely
+ * to exceed the number of CPU cores.
+ */
+ Assert(!CHashPtrIsMarked(scan.target));
+ nnew->next = scan.target;
+ if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target, new))
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+ goto retry;
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /*
+ * If the insert failed, add the entry we allocated to the appropriate
+ * garbage list. We can't simply put it back on the freelist,
+ * because that leads to an ABA problem: some other backend could
+ * read the head of the freelist, go into the tank, and then use
+ * CAS to pop an element. If, at that point, it finds the same
+ * element at the head of the freelist but with a different successor,
+ * we're hosed. To prevent that, we ensure that elements are added
+ * to a given freelist only after verifying that any allocations in
+ * progress at the time we popped the freelist have completed. This
+ * guarantees that any allocation still in progress at the time this
+ * element makes it back to the freelist is trying to allocate some
+ * other node.
+ */
+ CHashTableIncrementStatistic(table, CHS_Insert);
+ if (scan.found)
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+ CHashAddToGarbage(table, bucket, new);
+ }
+
+ /* The insert succeeded if and only if no duplicate was found. */
+ return !scan.found;
+}
+
+/*
+ * Delete from a concurrent hash table. entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket. */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+
+ /* If we found it, try to delete it. */
+ if (scan.found)
+ {
+ Assert(!CHashPtrIsMarked(scan.next));
+
+ /* Attempt to apply delete-mark. */
+ if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+ scan.next,
+ CHashPtrMark(scan.next)))
+ {
+ CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+ goto retry;
+ }
+
+ /* Deletion is done; attempt to remove node from list. */
+ if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target,
+ scan.next))
+ CHashAddToGarbage(table, bucket, scan.target);
+ else
+ {
+ CHashScanResult cleanup_scan;
+
+ /*
+ * If we weren't able to remove the deleted item, rescan
+ * the bucket to make sure it's really gone. This is just
+ * like a regular bucket scan, except that we don't care
+ * about the results. We're just doing it to achieve the
+ * side-effect of removing delete-marked nodes from the
+ * bucket chain.
+ */
+ CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+ CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /* We're done. */
+ CHashTableIncrementStatistic(table, CHS_Delete);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+ return scan.found;
+}
+
+/*
+ * Provide backend-local statistics to caller.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+ memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise. res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory. If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res)
+{
+ CHashPtr target;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node = NULL;
+
+retry:
+ pointer_to_target = start;
+ target = *pointer_to_target;
+ for (;;)
+ {
+ CHashPtr next;
+ uint32 h;
+ int cmp;
+
+ /*
+ * If we've reached the end of the bucket chain, stop; otherwise,
+ * figure out the actual address of the next item.
+ */
+ if (CHashPtrIsInvalid(target))
+ {
+ res->found = false;
+ break;
+ }
+ target_node = CHashTableGetNode(table, target);
+
+ /*
+ * target may have been fetched from an arena entry that could be
+ * concurrently modified, so a dependency barrier is required before
+ * dereferencing the derived pointer.
+ */
+ pg_read_barrier_depends();
+ next = target_node->next;
+
+ /*
+ * For simplicity, any bucket scan, even if it's servicing a search,
+ * will attempt to remove delete-marked items from the bucket. This
+ * ensures that delete-marked elements are removed from bucket chains
+ * as quickly as possible and reduces code duplication. See
+ * CHashDelete for further comments about why delete-marking is
+ * necessary and how it allows safe deletion.
+ */
+ if (CHashPtrIsMarked(next))
+ {
+zap:
+ if (__sync_bool_compare_and_swap(pointer_to_target,
+ target,
+ CHashPtrUnmark(next)))
+ {
+ /*
+ * No one else can possibly have modified target_node->next,
+ * because such modifications are not allowed after a
+ * delete-mark has been applied. Thus, if we just keep
+ * following the next pointers, we're guaranteed to visit
+ * all non-deleted items (and possibly some deleted items)
+ * that were present at the time we began the scan.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+ CHashAddToGarbage(table, hashcode & table->bucket_mask,
+ target);
+ target = CHashPtrUnmark(next);
+ }
+ else
+ {
+ /*
+ * If the previous node has been delete-marked, we can't
+ * further update the next-pointer, so restart the scan.
+ * Otherwise, we know that some other backend removed one
+ * or more deleted nodes from the bucket chain just as we
+ * were trying to do, and we can simply continue the scan
+ * from wherever the previous node is pointing now. It's
+ * possible that some concurrent inserts have happened, too,
+ * but that's OK; we can view those as having happened "before"
+ * whatever operation this scan is supporting.
+ *
+ * Although starvation is a theoretical possibility here if
+ * we are forced to retry repeatedly, even a single retry is
+ * vanishingly unlikely in practice. It requires some other
+ * backend to delete both the node that we're looking at and
+ * the node which precedes it before we advance to the next
+ * node. That could certainly happen occasionally, but we'd
+ * have to be pretty unlucky to have it happen even twice in
+ * a row.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+ target = *pointer_to_target;
+ if (CHashPtrIsMarked(target))
+ {
+ CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+ goto retry;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * Bucket chains are kept in order, so that there is exactly one legal
+ * point at which any given key can be inserted. The ordering is by
+ * hashcode first, and then by memcmp ordering of the keys involved.
+ */
+ h = target_node->un.hashcode;
+ if (h == hashcode)
+ cmp = memcmp(CHashNodeGetItem(target_node), key,
+ table->desc.key_size);
+ else if (h > hashcode)
+ cmp = 1;
+ else
+ cmp = -1;
+
+ /*
+ * If cmp < 0, then we haven't yet reached the point at which we'd
+ * expect to find the key, so we must continue the scan. If cmp == 0,
+ * we've found the key and can stop. If cmp > 0, we've either passed
+ * the point where we expect to find the key OR someone delete-marked
+ * the item and overwrote the hashcode with a gcnext pointer. In the
+ * latter case we must take care not to be fooled into stopping the
+ * scan early.
+ */
+ if (cmp >= 0)
+ {
+ if (cmp == 0)
+ {
+ res->found = true;
+ res->next = next;
+ break;
+ }
+ else
+ {
+ /*
+ * pg_read_barrier() prevents the reread of the next pointer
+ * from being speculated ahead of the read of the hash value.
+ */
+ pg_read_barrier();
+ next = target_node->next;
+ if (CHashPtrIsMarked(next))
+ goto zap;
+ res->found = false;
+ break;
+ }
+ }
+
+ /* Continue scan from next node. */
+ pointer_to_target = &target_node->next;
+ target = next;
+ }
+
+ /* Send results back to caller. */
+ res->target = target;
+ res->pointer_to_target = pointer_to_target;
+ res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists. Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list. In this way, if there's only one backend
+ * active, it will typically find a free element in the first place it looks:
+ * its own free list. It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+ uint32 f_current;
+ CHashPtr new;
+
+ /* Pick a starting freelist based on our backend ID. */
+ f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+ /* If this process hasn't initialized gc_next yet, do that now. */
+ if (table->gc_pid != MyProcPid)
+ {
+ table->gc_pid = MyProcPid;
+ table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+ }
+
+ /* Loop until we allocate an element. */
+ for (;;)
+ {
+ CHashPtr *b;
+
+ /*
+ * Try to pop an element from a freelist using compare-and-swap.
+ *
+ * Note that this is only safe if it's impossible for the next pointer
+ * of the first element on the list to change between the time when
+ * we read it and the time we use CAS to pop it off the list. This
+ * means that we can't allow any element that is currently on this
+ * freelist to be allocated, marked as garbage, garbage collected,
+ * and returned back to this freelist before we finish the CAS.
+ *
+ * If we attempt to pop the free-list and fail, we retry immediately
+ * with the same free-list. This reduces the frequency with which
+ * we're obliged to update our hazard pointers, which is a material
+ * savings due to the associated memory barrier.
+ */
+ b = CHashTableGetFreeList(table, f_current);
+ MyProc->hazard[0] = b;
+ pg_memory_barrier();
+ new = *b;
+ while (!CHashPtrIsInvalid(new))
+ {
+ CHashNode *n = CHashTableGetNode(table, new);
+
+ /*
+ * n is computed from table->freelist[f_current], which could
+ * be modified by concurrent activity, so we need a dependency
+ * barrier here.
+ */
+ pg_read_barrier_depends();
+ if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+ return new;
+ CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+ new = *b;
+ }
+
+ /* Attempt garbage collection. */
+ new = CHashAllocateViaGC(table);
+ if (!CHashPtrIsInvalid(new))
+ return new;
+
+ /* Advance to next freelist. */
+ f_current = (f_current + 1) % CHashTableNFreeLists(table);
+ }
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+ uint32 f_home;
+ CHashPtr *b;
+ CHashPtr *fh;
+ CHashPtr fhead;
+ CHashPtr garbage;
+ CHashPtr new;
+ CHashNode *n;
+ uint32 i;
+
+ /* Pick a target freelist based on our backend ID. */
+ f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+ fh = CHashTableGetFreeList(table, f_home);
+
+ /* Select target garbage list. */
+ table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+ b = CHashTableGetGarbageList(table, table->gc_next);
+ garbage = *b;
+
+ /* If list is empty, fail. */
+ if (CHashPtrIsInvalid(garbage))
+ return InvalidCHashPtr;
+
+ /* If we're unable to empty the list via compare-and-swap, fail. */
+ if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+ {
+ CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+ return InvalidCHashPtr;
+ }
+
+ /*
+ * Before moving the elements removed from the garbage list to the
+ * freelist, we must wait until (1) all bucket scans that might
+ * still see those elements as part of the bucket chain
+ * have completed and (2) all allocations that might see an old
+ * version of the freelist containing one of the elements to be
+ * garbage collected have completed.
+ *
+ * Note: We can't begin this operation until the clearing of the
+ * garbage list has been committed to memory, but since that was
+ * done using an atomic operation no explicit barrier is needed
+ * here.
+ *
+ * Note: We could have a "soft" version of this that merely
+ * requeues the garbage if it's not immediately recyclable, but
+ * it's not clear that we need such a thing. On the flip side we
+ * might want to eventually enter a longer sleep here, or PANIC,
+ * but it's not clear exactly how to calibrate that.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC);
+ MyProc->hazard[0] = NULL;
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+ void *hazard;
+
+ hazard = proc->hazard[0];
+ if (hazard == b || hazard == fh)
+ {
+ CHashTableIncrementStatistic(table, CHS_GC_Spin);
+ do
+ {
+ hazard = proc->hazard[0];
+ } while (hazard == b || hazard == fh);
+ }
+ }
+
+ /* Remove one item from list to satisfy current allocation. */
+ new = garbage;
+ n = CHashTableGetNode(table, new);
+ pg_read_barrier_depends();
+ fhead = n->un.gcnext;
+
+ if (CHashPtrIsInvalid(fhead))
+ {
+ /*
+ * We have reclaimed exactly one node, so there's nothing to put
+ * back on the free list. In this case (only) we need a
+ * memory barrier, because the reads above must complete
+ * before we overwrite n->un.gcnext with a new hashcode.
+ * (This is only needed when we reclaim exactly one node,
+ * because in any other case we'll do a compare-and-swap
+ * before returning, which implies a full barrier.)
+ */
+ pg_memory_barrier();
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+ }
+ else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+ {
+ /*
+ * Our free list was empty, and we've successfully pushed the
+ * reclaimed nodes onto it. So we're done.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+ }
+ else
+ {
+ CHashPtr fcurrent;
+ CHashPtr fnext;
+ CHashPtr oldhead;
+
+ /* Walk list of reclaimed elements to end. */
+ fcurrent = fhead;
+ for (;;)
+ {
+ n = CHashTableGetNode(table, fcurrent);
+ fnext = n->un.gcnext;
+ if (CHashPtrIsInvalid(fnext))
+ break;
+ fcurrent = fnext;
+ }
+
+ /* Push reclaimed elements onto home free list. */
+ for (;;)
+ {
+ oldhead = *fh;
+ n->un.gcnext = oldhead;
+ if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+ break;
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+ }
+ }
+
+ /* Return the element we saved for ourselves. */
+ return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list. We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer. It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+ CHashPtr g;
+ CHashNode *n;
+ CHashPtr *garbage;
+
+ n = CHashTableGetNode(table, c);
+ garbage = CHashTableGetGarbageByBucket(table, bucket);
+
+ while (1)
+ {
+ g = *garbage;
+ n->un.gcnext = g;
+ if (__sync_bool_compare_and_swap(garbage, g, c))
+ break;
+ CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+ }
+}
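
For reference, a hedged end-to-end sketch of the API defined above. The descriptor fields (shmem_name, capacity, element_size, key_size) are the ones chash.c itself reads; since the field order in CHashDescriptor is not shown here, designated initializers are used. The entry type, names, and call sites are invented. Entries are laid out key-first, as CHashSearch and CHashInsert assume.

    #include "postgres.h"
    #include "utils/chash.h"

    typedef struct
    {
        uint32 key;     /* first key_size bytes form the lookup key */
        uint32 value;   /* non-key portion, copied in and out */
    } DemoEntry;

    static CHashTable demo_table;
    static CHashDescriptor demo_desc = {
        .shmem_name = "demo chash",
        .capacity = 1024,
        .element_size = sizeof(DemoEntry),
        .key_size = sizeof(uint32)
    };

    /* At shared-memory sizing time: */
    static Size
    demo_shmem_size(void)
    {
        demo_table = CHashBootstrap(&demo_desc);
        return CHashEstimateSize(demo_table);
    }

    /* At shared-memory creation (or EXEC_BACKEND reattach) time: */
    static void
    demo_shmem_init(void)
    {
        demo_table = CHashInitialize(demo_table, &demo_desc);
    }

    /* In any backend afterwards: */
    static void
    demo_ops(void)
    {
        DemoEntry e;

        e.key = 42;
        e.value = 7;
        if (!CHashInsert(demo_table, &e))
            elog(LOG, "key exists; current value is %u", e.value);

        e.key = 42;
        if (CHashSearch(demo_table, &e))
            elog(LOG, "found value %u", e.value);

        (void) CHashDelete(demo_table, &e);
    }
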
diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c
index e6f69e4edd..7374cc8eb4 100644
--- a/src/bin/pg_basebackup/pg_receivexlog.c
+++ b/src/bin/pg_basebackup/pg_receivexlog.c
@@ -66,9 +66,12 @@ usage(void)
printf(_(" %s [OPTION]...\n"), progname);
printf(_("\nOptions:\n"));
printf(_(" -D, --directory=DIR receive transaction log files into this directory\n"));
+ printf(_(" -F --fsync-interval=SECS\n"
+ " time between fsyncs to transaction log files (default: %d)\n"), (fsync_interval / 1000));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
- printf(_(" -F --fsync-interval=INTERVAL\n"
- " frequency of syncs to transaction log files (in seconds)\n"));
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -76,12 +79,9 @@ usage(void)
printf(_(" -d, --dbname=CONNSTR connection string\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
printf(_(" -p, --port=PORT database server port number\n"));
- printf(_(" -s, --status-interval=INTERVAL\n"
- " time between status packets sent to server (in seconds)\n"));
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_("\nOptional actions:\n"));
printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c
index 1a01167912..0d97638851 100644
--- a/src/bin/pg_basebackup/pg_recvlogical.c
+++ b/src/bin/pg_basebackup/pg_recvlogical.c
@@ -62,15 +62,27 @@ static void disconnect_and_exit(int code);
static void
usage(void)
{
- printf(_("%s receives PostgreSQL logical change stream.\n\n"),
+ printf(_("%s receives PostgreSQL logical change streams.\n\n"),
progname);
printf(_("Usage:\n"));
printf(_(" %s [OPTION]...\n"), progname);
+ printf(_("\nAction to be performed:\n"));
+ printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nOptions:\n"));
- printf(_(" -f, --file=FILE receive log into this file. - for stdout\n"));
+ printf(_(" -f, --file=FILE receive log into this file, - for stdout\n"));
printf(_(" -F --fsync-interval=SECS\n"
- " frequency of syncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ printf(_(" -I, --startpos=LSN where in an existing slot should the streaming start\n"));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
+ printf(_(" -o, --option=NAME[=VALUE]\n"
+ " pass option NAME with optional value VALUE to the\n"
+ " output plugin\n"));
+ printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -81,19 +93,6 @@ usage(void)
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_("\nReplication options:\n"));
- printf(_(" -I, --startpos=PTR where in an existing slot should the streaming start\n"));
- printf(_(" -o, --option=NAME[=VALUE]\n"
- " specify option NAME with optional value VALUE, to be passed\n"
- " to the output plugin\n"));
- printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
- printf(_(" -s, --status-interval=SECS\n"
- " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
- printf(_(" -S, --slot=SLOT name of the logical replication slot\n"));
- printf(_("\nAction to be performed:\n"));
- printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
- printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
- printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nReport bugs to <pgsql-bugs@postgresql.org>.\n"));
}
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index a46ca53ba6..733f1cbc86 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1456,7 +1456,9 @@ pgwin32_doRegister(void)
NULL, NULL, "RPCSS\0", register_username, register_password)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not register service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1484,14 +1486,18 @@ pgwin32_doUnregister(void)
if ((hService = OpenService(hSCM, register_servicename, DELETE)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not open service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not open service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
if (!DeleteService(hService))
{
CloseServiceHandle(hService);
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1627,7 +1633,9 @@ pgwin32_doRunAsService(void)
if (StartServiceCtrlDispatcher(st) == 0)
{
- write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not start service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
}
@@ -1708,7 +1716,14 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
/* Open the current token to use as a base for the restricted one */
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &origToken))
{
- write_stderr(_("%s: could not open process token: error code %lu\n"), progname, GetLastError());
+ /*
+ * Most Windows targets make DWORD a 32-bit unsigned long. Cygwin
+ * x86_64, an LP64 target, makes it a 32-bit unsigned int. In code
+ * built for Cygwin as well as for native Windows targets, cast DWORD
+ * before printing.
+ */
+ write_stderr(_("%s: could not open process token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1721,7 +1736,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_POWER_USERS, 0, 0, 0, 0, 0,
0, &dropSids[1].Sid))
{
- write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not allocate SIDs: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1740,7 +1756,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
if (!b)
{
- write_stderr(_("%s: could not create restricted token: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not create restricted token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1791,7 +1808,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
HANDLE job;
char jobname[128];
- sprintf(jobname, "PostgreSQL_%lu", processInfo->dwProcessId);
+ sprintf(jobname, "PostgreSQL_%lu",
+ (unsigned long) processInfo->dwProcessId);
job = _CreateJobObject(NULL, jobname);
if (job)
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index c9e61dfa39..0e1e0cd5f0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -127,6 +127,10 @@ typedef struct HashJoinTableData
int nbuckets; /* # buckets in the in-memory hash table */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
+ int nbuckets_original; /* # buckets when starting the first hash */
+ int nbuckets_optimal; /* optimal # buckets (per batch) */
+ int log2_nbuckets_optimal; /* log2(nbuckets_optimal) */
+
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
@@ -148,6 +152,7 @@ typedef struct HashJoinTableData
bool growEnabled; /* flag to shut off nbatch increases */
double totalTuples; /* # tuples obtained from inner plan */
+ double skewTuples; /* # tuples inserted into skew hashtable */
/*
* These arrays are allocated for the life of the hash join, but only if
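[Editor's note: the new fields above let a hash join grow its bucket array when the planner's row estimate proves too low. A minimal sketch of the bookkeeping they enable, assuming a target fill factor NTUP_PER_BUCKET and a caller-supplied table; the commit's real logic lives in nodeHash.c, not in this hypothetical helper.]

    #include "postgres.h"
    #include "executor/hashjoin.h"

    #define NTUP_PER_BUCKET 10      /* assumed target tuples per bucket */

    /*
     * Double the optimal bucket count whenever the average bucket would
     * otherwise exceed the target fill factor (illustrative only).
     */
    static void
    update_optimal_buckets(HashJoinTable hashtable)
    {
        while (hashtable->totalTuples / hashtable->nbuckets_optimal >
               NTUP_PER_BUCKET)
        {
            hashtable->nbuckets_optimal *= 2;
            hashtable->log2_nbuckets_optimal += 1;
        }
    }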
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index b36705b862..6ef779bf95 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
*/
#include "port/atomics.h"
+/*
+ * If dependency barriers are undefined, we define them as no-ops. The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
#endif /* BARRIER_H */
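[Editor's note: a dependency barrier orders a pointer load against later loads through that pointer, which every mainstream CPU except DEC Alpha guarantees for free. A hedged consumer-side sketch; the Node type and shared_ptr variable are illustrative, not part of the patch.]

    #include "postgres.h"
    #include "storage/barrier.h"

    typedef struct Node { int payload; } Node;

    static Node *volatile shared_ptr;   /* assumed published by another backend */

    static int
    read_payload(void)
    {
        Node   *n = shared_ptr;         /* load the pointer */

        if (n == NULL)
            return -1;
        pg_read_barrier_depends();      /* order loads of *n after the load of n */
        return n->payload;              /* safe even on DEC Alpha */
    }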
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0e69b633c3..4c6fac8052 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
)
/*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
- ((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -200,9 +186,9 @@ extern void StrategyInitialize(bool init);
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern int BufTableLookup(BufferTag *tagPtr);
+extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr);
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
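[Editor's note: the signature change reflects the switch from a partitioned dynahash to the lock-free table; callers no longer compute a hash code or hold a buffer-mapping partition lock around the call. A hedged sketch of the caller-side difference, assuming the -1-on-miss return contract is unchanged; the real updates are in bufmgr.c and buf_table.c.]

    #include "postgres.h"
    #include "storage/buf_internals.h"

    /*
     * Old pattern (macros removed by this patch):
     *     hashcode = BufTableHashCode(&tag);
     *     partitionLock = BufMappingPartitionLock(hashcode);
     *     LWLockAcquire(partitionLock, LW_SHARED);
     *     buf_id = BufTableLookup(&tag, hashcode);
     *     LWLockRelease(partitionLock);
     */
    static int
    lookup_buffer(BufferTag *tag)
    {
        /* New pattern: the concurrent hash table synchronizes internally. */
        return BufTableLookup(tag);     /* buffer id, or -1 if not found */
    }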
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 595e69da48..8e98425ca4 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -140,7 +140,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
*/
/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
+#define NUM_BUFFER_PARTITIONS 0
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 38758d3ea5..cdf2f268fd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,14 @@ struct XidCache
#define FP_LOCK_SLOTS_PER_BACKEND 16
/*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define NUM_HAZARD_POINTERS 1
+
+/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
@@ -143,6 +151,12 @@ struct PGPROC
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
+
+ /*
+ * Hazard pointers. Currently one is enough, though some algorithms
+ * require a few more.
+ */
+ void *hazard[NUM_HAZARD_POINTERS];
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h
index 5bae3807af..063400ae28 100644
--- a/src/include/storage/shm_mq.h
+++ b/src/include/storage/shm_mq.h
@@ -25,6 +25,13 @@ typedef struct shm_mq shm_mq;
struct shm_mq_handle;
typedef struct shm_mq_handle shm_mq_handle;
+/* Descriptors for a single write spanning multiple locations. */
+typedef struct
+{
+ const char *data;
+ Size len;
+} shm_mq_iovec;
+
/* Possible results of a send or receive operation. */
typedef enum
{
@@ -52,12 +59,17 @@ extern PGPROC *shm_mq_get_sender(shm_mq *);
extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg,
BackgroundWorkerHandle *handle);
+/* Associate worker handle with shm_mq. */
+extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *);
+
/* Break connection. */
extern void shm_mq_detach(shm_mq *);
/* Send or receive messages. */
extern shm_mq_result shm_mq_send(shm_mq_handle *mqh,
- Size nbytes, void *data, bool nowait);
+ Size nbytes, const void *data, bool nowait);
+extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh,
+ shm_mq_iovec *iov, int iovcnt, bool nowait);
extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh,
Size *nbytesp, void **datap, bool nowait);
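[Editor's note: shm_mq_sendv() is the gathered-write counterpart of shm_mq_send(); the iovec lets a caller send pieces that live in different buffers as a single message, without an intermediate copy. A hedged usage sketch; MyHeader and the helper are hypothetical.]

    #include "postgres.h"
    #include "storage/shm_mq.h"

    typedef struct { int type; int flags; } MyHeader;   /* hypothetical */

    /* Send a fixed header plus a variable payload as one message. */
    static void
    send_two_part(shm_mq_handle *mqh, MyHeader hdr,
                  const char *payload, Size payload_len)
    {
        shm_mq_iovec    iov[2];
        shm_mq_result   res;

        iov[0].data = (const char *) &hdr;
        iov[0].len = sizeof(hdr);
        iov[1].data = payload;
        iov[1].len = payload_len;

        res = shm_mq_sendv(mqh, iov, 2, false);     /* block until sent */
        if (res == SHM_MQ_DETACHED)
            elog(ERROR, "receiver has detached");
    }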
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 745eb7e576..4ff8415fac 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index d88e7a3b26..fb1b4a42dd 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -676,13 +676,10 @@ extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_indexdef_string(Oid indexrelid);
-extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_constraintdef_string(Oid constraintId);
extern Datum pg_get_expr(PG_FUNCTION_ARGS);
extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
@@ -692,17 +689,9 @@ extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
-extern char *deparse_expression(Node *expr, List *dpcontext,
- bool forceprefix, bool showimplicit);
-extern List *deparse_context_for(const char *aliasname, Oid relid);
-extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
- List *rtable, List *rtable_names);
-extern List *select_rtable_names_for_explain(List *rtable,
- Bitmapset *rels_used);
extern const char *quote_identifier(const char *ident);
extern char *quote_qualified_identifier(const char *qualifier,
const char *ident);
-extern char *generate_collation_name(Oid collid);
/* tid.c */
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000000..ee0573c9c7
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ * Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H
+#define CHASH_H
+
+/* Everything the caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+ const char *shmem_name; /* shared memory name for this hash table */
+ uint32 capacity; /* maximum size of hash table */
+ uint16 element_size; /* size of each element */
+ uint16 key_size; /* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+ CHS_Search, /* search */
+ CHS_Search_Failed, /* search failed (no such key) */
+ CHS_Insert, /* insert */
+ CHS_Insert_Failed, /* insert failed (duplicate key) */
+ CHS_Insert_Retry, /* insert retried (concurrent update) */
+ CHS_Delete, /* delete */
+ CHS_Delete_Failed, /* delete failed (no such key) */
+ CHS_Delete_Retry, /* delete retried (concurrent update) */
+ CHS_Scan_Expunge, /* scan expunged deleted item */
+ CHS_Scan_Expunge_Fail, /* scan failed to expunge */
+ CHS_Scan_Restart, /* concurrent deletes forced a scan restart */
+ CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */
+ CHS_Allocate_Fail, /* allocation failed to pop freelist */
+ CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */
+ CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */
+ CHS_GC, /* garbage collection cycle */
+ CHS_GC_Spin, /* GC spun waiting for concurrent process */
+ CHS_GC_Reclaim_Skipped, /* GC recovered only one item */
+ CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */
+ CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */
+ CHS_NumberOfStatistics /* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif /* CHASH_H */
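[Editor's note: to make the API above concrete, a hedged sketch of bootstrapping and probing a table, loosely modeled on how this patch's buf_table.c uses it. The key-first entry layout and the search-fills-in-the-entry behavior are assumptions read off the declarations, not documented guarantees.]

    #include "postgres.h"
    #include "utils/chash.h"

    typedef struct
    {
        uint32      key;        /* key must be the leading key_size bytes */
        int         value;      /* payload */
    } MyEntry;

    static CHashTable MyTable;

    static void
    my_table_create(void)
    {
        CHashDescriptor desc;

        desc.shmem_name = "my chash table";
        desc.capacity = 1024;
        desc.element_size = sizeof(MyEntry);
        desc.key_size = sizeof(uint32);

        /* First backend bootstraps; others would attach via CHashInitialize. */
        MyTable = CHashBootstrap(&desc);
    }

    static bool
    my_table_lookup(uint32 key, int *value)
    {
        MyEntry     ent;

        ent.key = key;
        if (!CHashSearch(MyTable, &ent))    /* assumed to copy the hit back */
            return false;
        *value = ent.value;
        return true;
    }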
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
new file mode 100644
index 0000000000..520b06653c
--- /dev/null
+++ b/src/include/utils/ruleutils.h
@@ -0,0 +1,34 @@
+/*-------------------------------------------------------------------------
+ *
+ * ruleutils.h
+ * Declarations for ruleutils.c
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/ruleutils.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RULEUTILS_H
+#define RULEUTILS_H
+
+#include "nodes/nodes.h"
+#include "nodes/parsenodes.h"
+#include "nodes/pg_list.h"
+
+
+extern char *pg_get_indexdef_string(Oid indexrelid);
+extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
+
+extern char *pg_get_constraintdef_string(Oid constraintId);
+extern char *deparse_expression(Node *expr, List *dpcontext,
+ bool forceprefix, bool showimplicit);
+extern List *deparse_context_for(const char *aliasname, Oid relid);
+extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
+ List *rtable, List *rtable_names);
+extern List *select_rtable_names_for_explain(List *rtable,
+ Bitmapset *rels_used);
+extern char *generate_collation_name(Oid collid);
+
+#endif /* RULEUTILS_H */
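[Editor's note: these relocated declarations are the server-internal deparsing entry points; callers now include this header instead of builtins.h. A hedged sketch of typical use, deparsing an expression over a single relation; the helper name is illustrative.]

    #include "postgres.h"
    #include "utils/ruleutils.h"

    /* Hypothetical helper: render an expression tree over relation "relid". */
    static char *
    deparse_for_relation(Node *expr, const char *aliasname, Oid relid)
    {
        List   *context = deparse_context_for(aliasname, relid);

        return deparse_expression(expr, context, false, false);
    }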
diff --git a/src/port/crypt.c b/src/port/crypt.c
index ef8bf46338..6a902ef0fc 100644
--- a/src/port/crypt.c
+++ b/src/port/crypt.c
@@ -87,7 +87,7 @@ static int des_cipher(const char *in, char *out, long salt, int num_iter);
* define "B64" to be the declaration for a 64 bit integer.
* XXX this feature is currently unused, see "endian" comment below.
*/
-#define B64 __int64
+#define B64 int64
/*
* define "LARGEDATA" to get faster permutations, by using about 72 kilobytes
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index eb37da7168..9146f59435 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/jsonb_1.out b/src/test/regress/expected/jsonb_1.out
index f3bfc7bcf5..83d61f8c7e 100644
--- a/src/test/regress/expected/jsonb_1.out
+++ b/src/test/regress/expected/jsonb_1.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
index 1076b76210..b04510c599 100644
--- a/src/test/regress/expected/matview.out
+++ b/src/test/regress/expected/matview.out
@@ -411,7 +411,7 @@ REFRESH MATERIALIZED VIEW mv;
ERROR: could not create unique index "mv_a_idx"
DETAIL: Key (a)=(1) is duplicated.
REFRESH MATERIALIZED VIEW CONCURRENTLY mv;
-ERROR: new data for "mv" contains duplicate rows without any NULL columns
+ERROR: new data for "mv" contains duplicate rows without any null columns
DETAIL: Row: (1,10)
DROP TABLE foo CASCADE;
NOTICE: drop cascades to materialized view mv
diff --git a/src/test/regress/expected/polygon.out b/src/test/regress/expected/polygon.out
index b252902720..33388eb909 100644
--- a/src/test/regress/expected/polygon.out
+++ b/src/test/regress/expected/polygon.out
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
INSERT INTO POLYGON_TBL(f1) VALUES ('(2.0,0.0),(2.0,4.0),(0.0,0.0)');
@@ -128,15 +128,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -248,11 +249,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(
(1 row)
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";
true
diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
index ed266d5c88..f1ed021be2 100644
--- a/src/test/regress/sql/jsonb.sql
+++ b/src/test/regress/sql/jsonb.sql
@@ -156,6 +156,13 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"c"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "c":null}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "g":null}', '{"a":"b", "b":1, "c":null}');
diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql
index 2dad566f37..d95fa96447 100644
--- a/src/test/regress/sql/polygon.sql
+++ b/src/test/regress/sql/polygon.sql
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
@@ -83,15 +83,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -155,11 +156,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' ~= polygon '(3.0,1.0),(3.0,3.0),(
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS true;
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";