summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Freund2014-10-16 15:15:12 +0000
committerAndres Freund2014-10-16 15:15:12 +0000
commit18216ef51a4a6c4d99630708a0a2d4a612db7d0e (patch)
treed8dec68c6ffda935d7c926848bd79f3e992c7bbf
parentb2b35f355b37f35ff5304faa41af67d32c809c4a (diff)
parente0b11290ab77eee9c1282636b8af7e64afd9cfcb (diff)
Merge remote-tracking branch 'rhaas/chash2014' into rwlock-contention-cleantmp
* rhaas/chash2014: (87 commits) Oops. Oops. Use chash for buftable stuff. Fix #includes. Rearrange pointers so that the freelist pointers are as far from each other as possible, to reduce contention. Code cleanup. Reorganize fields to match comments. Refactor garbage collection logic into a separate subroutine. Set hazard pointers correctly instead of wrong. Duh. De-obfuscate deletion code, maybe. Code tightening. Add memory barrier in single-node-reclaim case. Improve comments. Get rid of CHashBucketCleanup; CHashBucketScan can do what we need. Comment fixes. Track GC reclaims skipped in stats. Wonky hack to print stats on every backend exit. Rewrite statistics system. Minor optimization of allocator. If we fail to allocate from a non-empty freelist, retry same list. Add some missing stats counter bumps. ...
-rw-r--r--contrib/Makefile1
-rw-r--r--contrib/hashtest/Makefile18
-rw-r--r--contrib/hashtest/hashtest--1.0.sql52
-rw-r--r--contrib/hashtest/hashtest.c527
-rw-r--r--contrib/hashtest/hashtest.control4
-rw-r--r--contrib/pg_upgrade/check.c216
-rw-r--r--contrib/pg_upgrade/controldata.c34
-rw-r--r--contrib/pg_upgrade/info.c14
-rw-r--r--contrib/pg_upgrade/pg_upgrade.h8
-rw-r--r--doc/src/sgml/catalogs.sgml3
-rw-r--r--doc/src/sgml/json.sgml12
-rw-r--r--doc/src/sgml/plpgsql.sgml4
-rw-r--r--src/Makefile.global.in2
-rw-r--r--src/backend/access/index/genam.c1
-rw-r--r--src/backend/access/nbtree/nbtpage.c2
-rw-r--r--src/backend/access/transam/xlog.c6
-rw-r--r--src/backend/catalog/heap.c1
-rw-r--r--src/backend/commands/dbcommands.c2
-rw-r--r--src/backend/commands/explain.c12
-rw-r--r--src/backend/commands/matview.c2
-rw-r--r--src/backend/commands/tablecmds.c3
-rw-r--r--src/backend/commands/typecmds.c1
-rw-r--r--src/backend/commands/view.c2
-rw-r--r--src/backend/executor/nodeHash.c131
-rw-r--r--src/backend/libpq/be-secure-openssl.c2
-rw-r--r--src/backend/storage/buffer/buf_table.c109
-rw-r--r--src/backend/storage/buffer/bufmgr.c194
-rw-r--r--src/backend/storage/ipc/shm_mq.c126
-rw-r--r--src/backend/storage/ipc/shmem.c23
-rw-r--r--src/backend/utils/adt/jsonb_op.c10
-rw-r--r--src/backend/utils/adt/jsonb_util.c16
-rw-r--r--src/backend/utils/adt/misc.c1
-rw-r--r--src/backend/utils/adt/ruleutils.c1
-rw-r--r--src/backend/utils/hash/Makefile2
-rw-r--r--src/backend/utils/hash/chash.c1075
-rw-r--r--src/bin/pg_basebackup/pg_receivexlog.c10
-rw-r--r--src/bin/pg_basebackup/pg_recvlogical.c31
-rw-r--r--src/bin/pg_ctl/pg_ctl.c34
-rw-r--r--src/include/executor/hashjoin.h5
-rw-r--r--src/include/storage/barrier.h8
-rw-r--r--src/include/storage/buf_internals.h20
-rw-r--r--src/include/storage/lwlock.h2
-rw-r--r--src/include/storage/proc.h14
-rw-r--r--src/include/storage/shm_mq.h14
-rw-r--r--src/include/storage/shmem.h1
-rw-r--r--src/include/utils/builtins.h11
-rw-r--r--src/include/utils/chash.h69
-rw-r--r--src/include/utils/ruleutils.h34
-rw-r--r--src/port/crypt.c2
-rw-r--r--src/test/regress/expected/jsonb.out36
-rw-r--r--src/test/regress/expected/jsonb_1.out36
-rw-r--r--src/test/regress/expected/matview.out2
-rw-r--r--src/test/regress/expected/polygon.out35
-rw-r--r--src/test/regress/sql/jsonb.sql7
-rw-r--r--src/test/regress/sql/polygon.sql35
55 files changed, 2493 insertions, 530 deletions
diff --git a/contrib/Makefile b/contrib/Makefile
index b37d0dd2c3..0b91ac10ee 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
earthdistance \
file_fdw \
fuzzystrmatch \
+ hashtest \
hstore \
intagg \
intarray \
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000000..3ee42f87d8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000000..e271baff0f
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000000..172a5bb156
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+Datum chash_insert_test(PG_FUNCTION_ARGS);
+Datum chash_search_test(PG_FUNCTION_ARGS);
+Datum chash_delete_test(PG_FUNCTION_ARGS);
+Datum chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum chash_collision_test(PG_FUNCTION_ARGS);
+Datum dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum dynahash_search_test(PG_FUNCTION_ARGS);
+Datum dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+ uint32 key;
+ uint32 val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+ "hashtest-chash", /* name */
+ 1048576, /* capacity */
+ sizeof(hentry), /* element size */
+ sizeof(uint32) /* key size */
+};
+
+#define DYNAHASH_PARTITIONS 16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+ (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+ Size cs;
+ Size ds;
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = hashtest_shmem_startup;
+ chash = CHashBootstrap(&cdesc);
+ cs = CHashEstimateSize(chash);
+ RequestAddinShmemSpace(cs);
+ ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+ RequestAddinShmemSpace(ds);
+ elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+ (unsigned) ds);
+ RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = hashtest_client_auth_hook;
+
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+ on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+ uint64 stats[CHS_NumberOfStatistics];
+ CHashStatisticsType i;
+ StringInfoData buf;
+
+ CHashStatistics(chash, stats);
+ initStringInfo(&buf);
+
+ for (i = 0; i < CHS_NumberOfStatistics; ++i)
+ {
+ if (stats[i] == 0)
+ continue;
+ appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+ CHashStatisticsNames[i]);
+ }
+
+ if (buf.len > 1)
+ {
+ buf.data[buf.len-2] = '\0';
+ elog(LOG, "chash statistics: %s", buf.data);
+ }
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+ HASHCTL info;
+ uint32 i;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Initialize concurrent hash table. */
+ chash = CHashInitialize(chash, &cdesc);
+
+ /* Initialize shared dynahash table. */
+ info.keysize = cdesc.key_size;
+ info.entrysize = cdesc.element_size;
+ info.hash = tag_hash;
+ info.num_partitions = DYNAHASH_PARTITIONS;
+
+ dynahash = ShmemInitHash("hashtest-dynahash",
+ cdesc.capacity, cdesc.capacity,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+ dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = i * 31;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != e.key * 31)
+ elog(LOG, "search %u: found %u", i, e.val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = CHashDelete(chash, &e);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = 0;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!CHashSearch(chash, &e))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!CHashDelete(chash, &e))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, e.val);
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+ bool found;
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_ENTER, &found);
+ if (!found)
+ e->val = val;
+ LWLockRelease(lockid);
+
+ return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_SHARED);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_FIND, NULL);
+ if (e)
+ *val = e->val;
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_REMOVE, NULL);
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, i * 31);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = dynahash_insert(i, i * 31);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+ uint32 val;
+
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != i* 31)
+ elog(LOG, "search %u: found %u", i, val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = dynahash_delete(i);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(seed | i, MyProcPid);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_search(seed | i, &val);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!dynahash_search(seed | i, &val))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(seed | i);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!dynahash_delete(seed | i))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, MyProcPid);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000000..b8e0f01346
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index bbfcab71ce..56db0dd654 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -14,12 +14,10 @@
#include "pg_upgrade.h"
-static void set_locale_and_encoding(ClusterInfo *cluster);
static void check_new_cluster_is_empty(void);
-static void check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl);
-static bool equivalent_locale(const char *loca, const char *locb);
-static bool equivalent_encoding(const char *chara, const char *charb);
+static void check_databases_are_compatible(void);
+static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb);
+static bool equivalent_locale(int category, const char *loca, const char *locb);
static void check_is_install_user(ClusterInfo *cluster);
static void check_for_prepared_transactions(ClusterInfo *cluster);
static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -81,8 +79,6 @@ check_and_dump_old_cluster(bool live_check)
if (!live_check)
start_postmaster(&old_cluster, true);
- set_locale_and_encoding(&old_cluster);
-
get_pg_database_relfilenode(&old_cluster);
/* Extract a list of databases and tables from the old cluster */
@@ -127,13 +123,10 @@ check_and_dump_old_cluster(bool live_check)
void
check_new_cluster(void)
{
- set_locale_and_encoding(&new_cluster);
-
- check_locale_and_encoding(&old_cluster.controldata, &new_cluster.controldata);
-
get_db_and_rel_infos(&new_cluster);
check_new_cluster_is_empty();
+ check_databases_are_compatible();
check_loadable_libraries();
@@ -279,93 +272,25 @@ check_cluster_compatibility(bool live_check)
/*
- * set_locale_and_encoding()
- *
- * query the database to get the template0 locale
- */
-static void
-set_locale_and_encoding(ClusterInfo *cluster)
-{
- ControlData *ctrl = &cluster->controldata;
- PGconn *conn;
- PGresult *res;
- int i_encoding;
- int cluster_version = cluster->major_version;
-
- conn = connectToServer(cluster, "template1");
-
- /* for pg < 80400, we got the values from pg_controldata */
- if (cluster_version >= 80400)
- {
- int i_datcollate;
- int i_datctype;
-
- res = executeQueryOrDie(conn,
- "SELECT datcollate, datctype "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_datcollate = PQfnumber(res, "datcollate");
- i_datctype = PQfnumber(res, "datctype");
-
- if (GET_MAJOR_VERSION(cluster->major_version) < 902)
- {
- /*
- * Pre-9.2 did not canonicalize the supplied locale names to match
- * what the system returns, while 9.2+ does, so convert pre-9.2 to
- * match.
- */
- ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE,
- pg_strdup(PQgetvalue(res, 0, i_datcollate)));
- ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE,
- pg_strdup(PQgetvalue(res, 0, i_datctype)));
- }
- else
- {
- ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
- ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
- }
-
- PQclear(res);
- }
-
- res = executeQueryOrDie(conn,
- "SELECT pg_catalog.pg_encoding_to_char(encoding) "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_encoding = PQfnumber(res, "pg_encoding_to_char");
- ctrl->encoding = pg_strdup(PQgetvalue(res, 0, i_encoding));
-
- PQclear(res);
-
- PQfinish(conn);
-}
-
-
-/*
* check_locale_and_encoding()
*
- * Check that old and new locale and encoding match. Even though the backend
- * tries to canonicalize stored locale names, the platform often doesn't
- * cooperate, so it's entirely possible that one DB thinks its locale is
- * "en_US.UTF-8" while the other says "en_US.utf8". Try to be forgiving.
+ * Check that locale and encoding of a database in the old and new clusters
+ * are compatible.
*/
static void
-check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl)
+check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb)
{
- if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate))
- pg_fatal("lc_collate cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_collate, newctrl->lc_collate);
- if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype))
- pg_fatal("lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_ctype, newctrl->lc_ctype);
- if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
- pg_fatal("encoding cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->encoding, newctrl->encoding);
+ if (olddb->db_encoding != newdb->db_encoding)
+ pg_fatal("encodings for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name,
+ pg_encoding_to_char(olddb->db_encoding),
+ pg_encoding_to_char(newdb->db_encoding));
+ if (!equivalent_locale(LC_COLLATE, olddb->db_collate, newdb->db_collate))
+ pg_fatal("lc_collate values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_collate, newdb->db_collate);
+ if (!equivalent_locale(LC_CTYPE, olddb->db_ctype, newdb->db_ctype))
+ pg_fatal("lc_ctype values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_ctype, newdb->db_ctype);
}
/*
@@ -373,61 +298,46 @@ check_locale_and_encoding(ControlData *oldctrl,
*
* Best effort locale-name comparison. Return false if we are not 100% sure
* the locales are equivalent.
+ *
+ * Note: The encoding parts of the names are ignored. This function is
+ * currently used to compare locale names stored in pg_database, and
+ * pg_database contains a separate encoding field. That's compared directly
+ * in check_locale_and_encoding().
*/
static bool
-equivalent_locale(const char *loca, const char *locb)
+equivalent_locale(int category, const char *loca, const char *locb)
{
- const char *chara = strrchr(loca, '.');
- const char *charb = strrchr(locb, '.');
- int lencmp;
-
- /* If they don't both contain an encoding part, just do strcasecmp(). */
- if (!chara || !charb)
- return (pg_strcasecmp(loca, locb) == 0);
+ const char *chara;
+ const char *charb;
+ char *canona;
+ char *canonb;
+ int lena;
+ int lenb;
/*
- * Compare the encoding parts. Windows tends to use code page numbers for
- * the encoding part, which equivalent_encoding() won't like, so accept if
- * the strings are case-insensitive equal; otherwise use
- * equivalent_encoding() to compare.
+ * If the names are equal, the locales are equivalent. Checking this
+ * first avoids calling setlocale() in the common case that the names
+ * are equal. That's a good thing, if setlocale() is buggy, for example.
*/
- if (pg_strcasecmp(chara + 1, charb + 1) != 0 &&
- !equivalent_encoding(chara + 1, charb + 1))
- return false;
+ if (pg_strcasecmp(loca, locb) == 0)
+ return true;
/*
- * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8).
- *
- * It's tempting to ignore non-alphanumeric chars here, but for now it's
- * not clear that that's necessary; just do case-insensitive comparison.
+ * Not identical. Canonicalize both names, remove the encoding parts,
+ * and try again.
*/
- lencmp = chara - loca;
- if (lencmp != charb - locb)
- return false;
+ canona = get_canonical_locale_name(category, loca);
+ chara = strrchr(canona, '.');
+ lena = chara ? (chara - canona) : strlen(canona);
- return (pg_strncasecmp(loca, locb, lencmp) == 0);
-}
+ canonb = get_canonical_locale_name(category, locb);
+ charb = strrchr(canonb, '.');
+ lenb = charb ? (charb - canonb) : strlen(canonb);
-/*
- * equivalent_encoding()
- *
- * Best effort encoding-name comparison. Return true only if the encodings
- * are valid server-side encodings and known equivalent.
- *
- * Because the lookup in pg_valid_server_encoding() does case folding and
- * ignores non-alphanumeric characters, this will recognize many popular
- * variant spellings as equivalent, eg "utf8" and "UTF-8" will match.
- */
-static bool
-equivalent_encoding(const char *chara, const char *charb)
-{
- int enca = pg_valid_server_encoding(chara);
- int encb = pg_valid_server_encoding(charb);
+ if (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0)
+ return true;
- if (enca < 0 || encb < 0)
- return false;
-
- return (enca == encb);
+ return false;
}
@@ -450,7 +360,35 @@ check_new_cluster_is_empty(void)
new_cluster.dbarr.dbs[dbnum].db_name);
}
}
+}
+
+/*
+ * Check that every database that already exists in the new cluster is
+ * compatible with the corresponding database in the old one.
+ */
+static void
+check_databases_are_compatible(void)
+{
+ int newdbnum;
+ int olddbnum;
+ DbInfo *newdbinfo;
+ DbInfo *olddbinfo;
+ for (newdbnum = 0; newdbnum < new_cluster.dbarr.ndbs; newdbnum++)
+ {
+ newdbinfo = &new_cluster.dbarr.dbs[newdbnum];
+
+ /* Find the corresponding database in the old cluster */
+ for (olddbnum = 0; olddbnum < old_cluster.dbarr.ndbs; olddbnum++)
+ {
+ olddbinfo = &old_cluster.dbarr.dbs[olddbnum];
+ if (strcmp(newdbinfo->db_name, olddbinfo->db_name) == 0)
+ {
+ check_locale_and_encoding(olddbinfo, newdbinfo);
+ break;
+ }
+ }
+ }
}
@@ -470,7 +408,8 @@ create_script_for_cluster_analyze(char **analyze_script_file_name)
if (os_info.user_specified)
user_specification = psprintf("-U \"%s\" ", os_info.user);
- *analyze_script_file_name = psprintf("analyze_new_cluster.%s", SCRIPT_EXT);
+ *analyze_script_file_name = psprintf("%sanalyze_new_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
if ((script = fopen_priv(*analyze_script_file_name, "w")) == NULL)
pg_fatal("Could not open file \"%s\": %s\n",
@@ -551,7 +490,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
int tblnum;
char old_cluster_pgdata[MAXPGPATH];
- *deletion_script_file_name = psprintf("delete_old_cluster.%s", SCRIPT_EXT);
+ *deletion_script_file_name = psprintf("%sdelete_old_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
/*
* Some users (oddly) create tablespaces inside the cluster data
diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c
index 8379ebd71b..4e9d5948fa 100644
--- a/contrib/pg_upgrade/controldata.c
+++ b/contrib/pg_upgrade/controldata.c
@@ -122,10 +122,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
pg_fatal("Could not get control data using %s: %s\n",
cmd, getErrorText(errno));
- /* Only pre-8.4 has these so if they are not set below we will check later */
- cluster->controldata.lc_collate = NULL;
- cluster->controldata.lc_ctype = NULL;
-
/* Only in <= 9.2 */
if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
{
@@ -404,36 +400,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
cluster->controldata.data_checksum_version = str2uint(p);
got_data_checksum_version = true;
}
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_collate = pg_strdup(p);
- }
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_CTYPE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_ctype = pg_strdup(p);
- }
}
if (output)
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
index a1773aa8e5..c347dfc493 100644
--- a/contrib/pg_upgrade/info.c
+++ b/contrib/pg_upgrade/info.c
@@ -239,11 +239,15 @@ get_db_infos(ClusterInfo *cluster)
DbInfo *dbinfos;
int i_datname,
i_oid,
+ i_encoding,
+ i_datcollate,
+ i_datctype,
i_spclocation;
char query[QUERY_ALLOC];
snprintf(query, sizeof(query),
- "SELECT d.oid, d.datname, %s "
+ "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "
+ "%s AS spclocation "
"FROM pg_catalog.pg_database d "
" LEFT OUTER JOIN pg_catalog.pg_tablespace t "
" ON d.dattablespace = t.oid "
@@ -252,12 +256,15 @@ get_db_infos(ClusterInfo *cluster)
"ORDER BY 2",
/* 9.2 removed the spclocation column */
(GET_MAJOR_VERSION(cluster->major_version) <= 901) ?
- "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid) AS spclocation");
+ "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid)");
res = executeQueryOrDie(conn, "%s", query);
i_oid = PQfnumber(res, "oid");
i_datname = PQfnumber(res, "datname");
+ i_encoding = PQfnumber(res, "encoding");
+ i_datcollate = PQfnumber(res, "datcollate");
+ i_datctype = PQfnumber(res, "datctype");
i_spclocation = PQfnumber(res, "spclocation");
ntups = PQntuples(res);
@@ -267,6 +274,9 @@ get_db_infos(ClusterInfo *cluster)
{
dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid));
dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname));
+ dbinfos[tupnum].db_encoding = atoi(PQgetvalue(res, tupnum, i_encoding));
+ dbinfos[tupnum].db_collate = pg_strdup(PQgetvalue(res, tupnum, i_datcollate));
+ dbinfos[tupnum].db_ctype = pg_strdup(PQgetvalue(res, tupnum, i_datctype));
snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s",
PQgetvalue(res, tupnum, i_spclocation));
}
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 56a7505a96..c3b81e4a08 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -76,6 +76,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '/'
#define RM_CMD "rm -f"
#define RMDIR_CMD "rm -rf"
+#define SCRIPT_PREFIX "./"
#define SCRIPT_EXT "sh"
#define ECHO_QUOTE "'"
#define ECHO_BLANK ""
@@ -86,6 +87,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '\\'
#define RM_CMD "DEL /q"
#define RMDIR_CMD "RMDIR /s/q"
+#define SCRIPT_PREFIX ""
#define SCRIPT_EXT "bat"
#define EXE_EXT ".exe"
#define ECHO_QUOTE ""
@@ -180,6 +182,9 @@ typedef struct
char *db_name; /* database name */
char db_tablespace[MAXPGPATH]; /* database default tablespace
* path */
+ char *db_collate;
+ char *db_ctype;
+ int db_encoding;
RelInfoArr rel_arr; /* array of all user relinfos */
} DbInfo;
@@ -218,9 +223,6 @@ typedef struct
bool date_is_int;
bool float8_pass_by_value;
bool data_checksum_version;
- char *lc_collate;
- char *lc_ctype;
- char *encoding;
} ControlData;
/*
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index f4617b67e9..f98e282741 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1227,8 +1227,7 @@
<entry><type>bool</type></entry>
<entry></entry>
<entry>
- This represents a not-null constraint. It is possible to
- change this column to enable or disable the constraint.
+ This represents a not-null constraint.
</entry>
</row>
diff --git a/doc/src/sgml/json.sgml b/doc/src/sgml/json.sgml
index 37dd611aeb..8feb2fbf0a 100644
--- a/doc/src/sgml/json.sgml
+++ b/doc/src/sgml/json.sgml
@@ -269,6 +269,12 @@ SELECT '"foo"'::jsonb @> '"foo"'::jsonb;
-- The array on the right side is contained within the one on the left:
SELECT '[1, 2, 3]'::jsonb @> '[1, 3]'::jsonb;
+-- Order of array elements is not significant, so this is also true:
+SELECT '[1, 2, 3]'::jsonb @> '[3, 1]'::jsonb;
+
+-- Duplicate array elements don't matter either:
+SELECT '[1, 2, 3]'::jsonb @> '[1, 2, 2]'::jsonb;
+
-- The object with a single pair on the right side is contained
-- within the object on the left side:
SELECT '{"product": "PostgreSQL", "version": 9.4, "jsonb":true}'::jsonb @> '{"version":9.4}'::jsonb;
@@ -288,8 +294,10 @@ SELECT '{"foo": {"bar": "baz"}}'::jsonb @> '{"bar": "baz"}'::jsonb; -- yields f
The general principle is that the contained object must match the
containing object as to structure and data contents, possibly after
discarding some non-matching array elements or object key/value pairs
- from the containing object. However, the order of array elements is
- not significant when doing a containment match.
+ from the containing object.
+ But remember that the order of array elements is not significant when
+ doing a containment match, and duplicate array elements are effectively
+ considered only once.
</para>
<para>
diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index f008e937ee..f195495520 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -487,8 +487,8 @@ $$ LANGUAGE plpgsql;
CREATE FUNCTION extended_sales(p_itemno int)
RETURNS TABLE(quantity int, total numeric) AS $$
BEGIN
- RETURN QUERY SELECT quantity, quantity * price FROM sales
- WHERE itemno = p_itemno;
+ RETURN QUERY SELECT s.quantity, s.quantity * s.price FROM sales AS s
+ WHERE s.itemno = p_itemno;
END;
$$ LANGUAGE plpgsql;
</programlisting>
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 2af9413f21..e76b22fb2d 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -302,7 +302,7 @@ PROVE_FLAGS = --verbose
# prepend to path if already set, else just set it
define add_to_path
-$(1)='$(if $($(1)),$(2):$$$(1),$(2))'
+$(1)="$(if $($(1)),$(2):$$$(1),$(2))"
endef
# platform-specific environment variable to set shared library path
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 850008b340..8849c08e54 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -28,6 +28,7 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index bab5a49187..b71f65de2c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1186,7 +1186,7 @@ _bt_pagedel(Relation rel, Buffer buf)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
- errhint("This can be caused by an interrupt VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+ errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
_bt_relbuf(rel, buf);
return ndeleted;
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 5a4dbb9c53..235b442296 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5193,8 +5193,8 @@ readRecoveryCommandFile(void)
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid recovery_target parameter"),
- errhint("The only allowed value is 'immediate'")));
+ errmsg("invalid value for recovery parameter \"recovery_target\""),
+ errhint("The only allowed value is \"immediate\".")));
ereport(DEBUG2,
(errmsg_internal("recovery_target = '%s'",
item->value)));
@@ -5257,7 +5257,7 @@ readRecoveryCommandFile(void)
"recovery_min_apply_delay"),
hintmsg ? errhint("%s", _(hintmsg)) : 0));
ereport(DEBUG2,
- (errmsg("recovery_min_apply_delay = '%s'", item->value)));
+ (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
}
else
ereport(FATAL,
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 55c1e79563..c0eade0a3d 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -69,6 +69,7 @@
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 7831e900ba..b52e6b9bc0 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -839,7 +839,7 @@ dropdb(const char *dbname, bool missing_ok)
if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active))
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
- errmsg("database \"%s\" is used by a logical decoding slot",
+ errmsg("database \"%s\" is used by a logical replication slot",
dbname),
errdetail_plural("There is %d slot, %d of them active.",
"There are %d slots, %d of them active.",
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736115..387d263e87 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -28,6 +28,7 @@
#include "utils/json.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tuplesort.h"
#include "utils/xml.h"
@@ -1900,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+ ExplainPropertyLong("Original Hash Buckets",
+ hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch)
+ else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+ (hashtable->nbuckets_original != hashtable->nbuckets))
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
- hashtable->nbuckets, hashtable->nbatch,
- hashtable->nbatch_original, spacePeakKb);
+ "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ hashtable->nbuckets, hashtable->nbuckets_original,
+ hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
}
else
{
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index d1c8bb0d53..30bd40db18 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -597,7 +597,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner,
{
ereport(ERROR,
(errcode(ERRCODE_CARDINALITY_VIOLATION),
- errmsg("new data for \"%s\" contains duplicate rows without any NULL columns",
+ errmsg("new data for \"%s\" contains duplicate rows without any null columns",
RelationGetRelationName(matviewRel)),
errdetail("Row: %s",
SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1))));
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index cb16c53a60..ecdff1e5e3 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -85,6 +85,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
@@ -9045,7 +9046,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation,
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
}
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index ad364efbcb..55a68810f2 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -72,6 +72,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c
index 9d0039c42a..184bcd0582 100644
--- a/src/backend/commands/view.c
+++ b/src/backend/commands/view.c
@@ -471,7 +471,7 @@ DefineView(ViewStmt *stmt, const char *queryString)
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index b428c18b5c..7c5bb77b0c 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
/* It's a skew tuple, so put it into that hash table */
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
bucketNumber);
+ hashtable->skewTuples += 1;
}
else
{
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
}
}
+ /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+ if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+ {
+ /* We never decrease the number of buckets. */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets %d => %d\n",
+ hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+ ExecHashIncreaseNumBuckets(hashtable);
+ }
+
+ /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+ hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
+ hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
+ hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
+ /* If know we need to resize nbuckets, we can do it while rebatching. */
+ if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+ {
+ /* we never decrease the number of buckets */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+ hashtable->buckets = repalloc(hashtable->buckets,
+ sizeof(HashJoinTuple) * hashtable->nbuckets);
+ }
+
/*
* We will scan through the chunks directly, so that we can reset the
* buckets now and not have to keep track which tuples in the buckets have
@@ -704,6 +742,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
+ * ExecHashIncreaseNumBuckets
+ * increase the original number of buckets in order to reduce
+ * number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ HashMemoryChunk chunk;
+
+ /* do nothing if not an increase (it's called increase for a reason) */
+ if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+ return;
+
+ /*
+ * We already know the optimal number of buckets, so let's just
+ * compute the log2_nbuckets for it.
+ */
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets <= (INT_MAX / 2));
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+ /*
+ * Just reallocate the proper number of buckets - we don't need to
+ * walk through them - we can walk the dense-allocated chunks
+ * (just like in ExecHashIncreaseNumBatches, but without all the
+ * copying into new chunks)
+ */
+ hashtable->buckets =
+ (HashJoinTuple *) repalloc(hashtable->buckets,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+ /* scan through all tuples in all chunks to rebuild the hash table */
+ for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
+ {
+ /* process all tuples stored in this chunk */
+ size_t idx = 0;
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* add the tuple to the proper bucket */
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+
+ /* advance index past the tuple */
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+ }
+
+#ifdef HJDEBUG
+ printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ hashtable->nbuckets, batchTuples / hashtable->nbuckets);
+#endif
+}
+
+
+/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
*/
HashJoinTuple hashTuple;
int hashTupleSize;
+ double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
/* Create the HashJoinTuple */
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashTuple->next = hashtable->buckets[bucketno];
hashtable->buckets[bucketno] = hashTuple;
+ /*
+ * Increase the (optimal) number of buckets if we just exceeded the
+ * NTUP_PER_BUCKET threshold, but only when there's still a single batch.
+ */
+ if ((hashtable->nbatch == 1) &&
+ (hashtable->nbuckets_optimal <= INT_MAX/2) && /* overflow protection */
+ (ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+ {
+ hashtable->nbuckets_optimal *= 2;
+ hashtable->log2_nbuckets_optimal += 1;
+ }
+
/* Account for space used, and back off if we've used too much */
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
- if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+ if (hashtable->spaceUsed +
+ hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
}
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
* functions are good about randomizing all their output bits, else we are
* likely to have very skewed bucket or batch occupancy.)
*
- * nbuckets doesn't change over the course of the join.
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth. Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
*
* nbatch is always a power of 2; we increase it only by doubling it. This
* effectively adds one more bit to the top of the batchno.
diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c
index 8d8f12952a..b05364ced0 100644
--- a/src/backend/libpq/be-secure-openssl.c
+++ b/src/backend/libpq/be-secure-openssl.c
@@ -614,7 +614,7 @@ be_tls_write(Port *port, void *ptr, size_t len)
if (retries >= 20)
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg("unable to complete SSL handshake")));
+ errmsg("could not complete SSL handshake on renegotiation, too many failures")));
}
}
}
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 7a38f2f150..092cf8fe43 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,8 +21,10 @@
*/
#include "postgres.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
+#include "utils/chash.h"
/* entry for buffer lookup hashtable */
@@ -32,8 +34,13 @@ typedef struct
int id; /* Associated buffer ID */
} BufferLookupEnt;
-static HTAB *SharedBufHash;
-
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/*
* Estimate space needed for mapping hashtable
@@ -42,7 +49,13 @@ static HTAB *SharedBufHash;
Size
BufTableShmemSize(int size)
{
- return hash_estimate_size(size, sizeof(BufferLookupEnt));
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
}
/*
@@ -52,59 +65,29 @@ BufTableShmemSize(int size)
void
InitBufTable(int size)
{
- HASHCTL info;
-
- /* assume no locking is needed yet */
-
- /* BufferTag maps to Buffer */
- info.keysize = sizeof(BufferTag);
- info.entrysize = sizeof(BufferLookupEnt);
- info.hash = tag_hash;
- info.num_partitions = NUM_BUFFER_PARTITIONS;
-
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
- size, size,
- &info,
- HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- * Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag. We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
- return get_hash_value(SharedBufHash, (void *) tagPtr);
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
}
/*
* BufTableLookup
* Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
*/
int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+BufTableLookup(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_FIND,
- NULL);
+ BufferLookupEnt ent;
- if (!result)
+ ent.key = *tagPtr;
+ if (!CHashSearch(SharedBufHash, &ent))
return -1;
- return result->id;
+ return ent.id;
}
/*
@@ -118,27 +101,20 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+BufTableInsert(BufferTag *tagPtr, int buf_id)
{
- BufferLookupEnt *result;
- bool found;
+ BufferLookupEnt ent;
+
+ ent.key = *tagPtr;
+ ent.id = buf_id;
Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_ENTER,
- &found);
-
- if (found) /* found something already in the table */
- return result->id;
-
- result->id = buf_id;
+ if (CHashInsert(SharedBufHash, &ent))
+ return -1;
- return -1;
+ return ent.id;
}
/*
@@ -148,17 +124,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+BufTableDelete(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_REMOVE,
- NULL);
-
- if (!result) /* shouldn't happen */
+ if (!CHashDelete(SharedBufHash, tagPtr))
elog(ERROR, "shared buffer hash table corrupted");
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 45d1d61d95..437deb905c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -429,22 +429,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
else
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
int buf_id;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
+ buf_id = BufTableLookup(&newTag);
/* If not in buffers, initiate prefetch */
if (buf_id < 0)
@@ -822,11 +814,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
bool *foundPtr)
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
BufferTag oldTag; /* previous identity of selected buffer */
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
int buf_id;
volatile BufferDesc *buf;
@@ -835,29 +823,31 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
+start:
+ buf_id = BufTableLookup(&newTag);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
* Found it. Now, pin the buffer so no one can steal it from the
- * buffer pool, and check to see if the correct data has been loaded
- * into the buffer.
+ * buffer pool.
*/
- buf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[buf_id];
- valid = PinBuffer(buf, strategy);
+ valid = PinBuffer(foundbuf, strategy);
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
+ /* Check whether someone recycled the buffer before we pinned it. */
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto start;
+ }
*foundPtr = TRUE;
+ /* Check to see if the correct data has been loaded into the buffer. */
if (!valid)
{
/*
@@ -867,7 +857,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer must
@@ -877,15 +867,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. Remember to unlock the mapping lock while doing the work.
- */
- LWLockRelease(newPartitionLock);
-
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -986,42 +970,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
if (oldFlags & BM_TAG_VALID)
{
- /*
- * Need to compute the old tag's hashcode and partition lock ID.
- * XXX is it worth storing the hashcode in BufferDesc so we need
- * not recompute it here? Probably not.
- */
+ /* Save old tag. */
oldTag = buf->tag;
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
- /*
- * Must lock the lower-numbered partition first to avoid
- * deadlocks.
- */
- if (oldPartitionLock < newPartitionLock)
- {
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- else if (oldPartitionLock > newPartitionLock)
- {
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- }
- else
- {
- /* only one partition, only one lock */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- }
- else
- {
- /* if it wasn't valid, we need only the new partition */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- /* these just keep the compiler quiet about uninit variables */
- oldHash = 0;
- oldPartitionLock = 0;
}
/*
@@ -1031,32 +981,34 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
- buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+enter:
+ buf_id = BufTableInsert(&newTag, buf->buf_id);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
- * Got a collision. Someone has already done what we were about to
- * do. We'll just handle this as if it were found in the buffer
- * pool in the first place. First, give up the buffer we were
- * planning to use.
+ * We've got a collision, apparently: it looks like someone else
+ * did what we were about to do. We can handle this as if we had
+ * found the buffer in the pool in the first place, but we must
+ * recheck the buffer tag after pinning it, because it could still
+ * get renamed under us.
+ */
+ foundbuf = &BufferDescriptors[buf_id];
+ valid = PinBuffer(foundbuf, strategy);
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto enter;
+ }
+
+ /*
+ * Collision confirmed. Give up the buffer we were planning to
+ * use.
*/
UnpinBuffer(buf, true);
- /* Can give up that buffer's mapping partition lock now */
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
-
- /* remaining code should match code at top of routine */
-
- buf = &BufferDescriptors[buf_id];
-
- valid = PinBuffer(buf, strategy);
-
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
-
*foundPtr = TRUE;
if (!valid)
@@ -1068,7 +1020,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer
@@ -1078,7 +1030,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
/*
@@ -1097,11 +1049,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag, newHash);
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&newTag);
UnpinBuffer(buf, true);
}
@@ -1124,13 +1072,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf);
if (oldFlags & BM_TAG_VALID)
- {
- BufTableDelete(&oldTag, oldHash);
- if (oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- }
-
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1166,42 +1108,11 @@ static void
InvalidateBuffer(volatile BufferDesc *buf)
{
BufferTag oldTag;
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
- UnlockBufHdr(buf);
-
- /*
- * Need to compute the old tag's hashcode and partition lock ID. XXX is it
- * worth storing the hashcode in BufferDesc so we need not recompute it
- * here? Probably not.
- */
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
- /*
- * Acquire exclusive mapping lock in preparation for changing the buffer's
- * association.
- */
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
- /* Re-lock the buffer header */
- LockBufHdr(buf);
-
- /* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
- {
- UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
- return;
- }
-
/*
* We assume the only reason for it to be pinned is that someone else is
* flushing the page out. Wait for them to finish. (This could be an
@@ -1211,15 +1122,21 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ while (buf->refcount != 0)
{
UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (GetPrivateRefCount(buf->buf_id) > 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
- goto retry;
+ LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ {
+ UnlockBufHdr(buf);
+ return;
+ }
}
/*
@@ -1237,12 +1154,7 @@ retry:
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
-
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Insert the buffer at the head of the list of free buffers.
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index d96627a774..90df5930e1 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -139,7 +139,7 @@ struct shm_mq_handle
};
static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes,
- void *data, bool nowait, Size *bytes_written);
+ const void *data, bool nowait, Size *bytes_written);
static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed,
bool nowait, Size *nbytesp, void **datap);
static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
@@ -301,7 +301,33 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
}
/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
* Write a message into a shared message queue.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait)
+{
+ shm_mq_iovec iov;
+
+ iov.data = data;
+ iov.len = nbytes;
+
+ return shm_mq_sendv(mqh, &iov, 1, nowait);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
*
* When nowait = false, we'll wait on our process latch when the ring buffer
* fills up, and then continue writing once the receiver has drained some data.
@@ -315,14 +341,22 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
* the length or payload will corrupt the queue.)
*/
shm_mq_result
-shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
{
shm_mq_result res;
shm_mq *mq = mqh->mqh_queue;
+ Size nbytes = 0;
Size bytes_written;
+ int i;
+ int which_iov = 0;
+ Size offset;
Assert(mq->mq_sender == MyProc);
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
/* Try to write, or finish writing, the length word into the buffer. */
while (!mqh->mqh_length_word_complete)
{
@@ -348,18 +382,80 @@ shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
/* Write the actual data bytes into the buffer. */
Assert(mqh->mqh_partial_bytes <= nbytes);
- res = shm_mq_send_bytes(mqh, nbytes - mqh->mqh_partial_bytes,
- ((char *) data) + mqh->mqh_partial_bytes,
- nowait, &bytes_written);
- if (res == SHM_MQ_WOULD_BLOCK)
- mqh->mqh_partial_bytes += bytes_written;
- else
+ offset = mqh->mqh_partial_bytes;
+ do
{
- mqh->mqh_partial_bytes = 0;
- mqh->mqh_length_word_complete = false;
- }
- if (res != SHM_MQ_SUCCESS)
- return res;
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd,
+ * except the last. Thus, if a chunk other than the last one ends
+ * on a non-MAXALIGN'd boundary, we have to combine the tail end of
+ * its data with data from one or more following chunks until we
+ * either reach the last chunk or accumulate a number of bytes which
+ * is MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0;
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
/* Notify receiver of the newly-written data, and return. */
return shm_mq_notify_receiver(mq);
@@ -653,8 +749,8 @@ shm_mq_detach(shm_mq *mq)
* Write bytes into a shared message queue.
*/
static shm_mq_result
-shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait,
- Size *bytes_written)
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
{
shm_mq *mq = mqh->mqh_queue;
Size sent = 0;
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 2ea2216a65..38614a449d 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
return structPtr;
}
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+ ShmemIndexEnt *result;
+ void *ptr;
+ bool found;
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, &found);
+ if (!found || result == NULL)
+ elog(ERROR, "shared memory structure %s not found", name);
+ ptr = result->location;
+ Assert(ptr != NULL);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return ptr;
+}
/*
* Add two Size values, checking for overflow
diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c
index 2d071b2523..d9aaac9ac2 100644
--- a/src/backend/utils/adt/jsonb_op.c
+++ b/src/backend/utils/adt/jsonb_op.c
@@ -57,7 +57,7 @@ jsonb_exists_any(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -90,7 +90,7 @@ jsonb_exists_all(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -117,8 +117,7 @@ jsonb_contains(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
@@ -137,8 +136,7 @@ jsonb_contained(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index f157df3532..2ff85396d0 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -957,13 +957,24 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
}
else if (rcont == WJB_BEGIN_OBJECT)
{
- JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
-
+ Assert(vval.type == jbvObject);
Assert(vcontained.type == jbvObject);
+ /*
+ * If the lhs has fewer pairs than the rhs, it can't possibly contain
+ * the rhs. (This conclusion is safe only because we de-duplicate
+ * keys in all Jsonb objects; thus there can be no corresponding
+ * optimization in the array case.) The case probably won't arise
+ * often, but since it's such a cheap check we may as well make it.
+ */
+ if (vval.val.object.nPairs < vcontained.val.object.nPairs)
+ return false;
+
/* Work through rhs "is it contained within?" object */
for (;;)
{
+ JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
+
rcont = JsonbIteratorNext(mContained, &vcontained, false);
/*
@@ -1047,6 +1058,7 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
JsonbValue *lhsConts = NULL;
uint32 nLhsElems = vval.val.array.nElems;
+ Assert(vval.type == jbvArray);
Assert(vcontained.type == jbvArray);
/*
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 4eeb6314fa..67539ecde9 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -35,6 +35,7 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 6e41cbd142..24ade6cc20 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -55,6 +55,7 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c856..5d5338266d 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000000..0d4dc78a4e
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ * concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic. An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As normal, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS). Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap. When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS. Once this
+ * step is completed, the node is deleted. As a cleanup operation, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node. Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node. Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed. This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists. There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa. Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't. For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system. For this code, that
+ * is true except when garbage collection occurs. In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time. The algorithm
+ * could be adapted so as to be completely lock-free, essentially by guaranteeing
+ * that even in the worst case no combination of processes can obstruct garbage
+ * collection to a sufficient degree as to prevent an inserter from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity. However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr ((uint32) -2)
+#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x) ((x) & 1)
+#define CHashPtrGetOffset(x) ((x) >> 1)
+#define CHashPtrMark(x) ((x) | 1)
+#define CHashPtrUnmark(x) ((x) & ~1)
+#define MakeCHashPtr(x) ((x) << 1)
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item. Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+ CHashPtr next; /* arena offset of next element */
+ union
+ {
+ uint32 hashcode; /* hash(key) */
+ CHashPtr gcnext; /* arena offset of next garbage item */
+ } un;
+} CHashNode;
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table. We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+ /*
+ * These fields do not change after initialization.
+ */
+ CHashDescriptor desc; /* descriptor for this hash table */
+ uint32 bucket_mask; /* # of buckets, minus one */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
+ char *arena; /* arena */
+
+ /*
+ * These fields will be different in each backend; the shared copy is
+ * irrelevant.
+ */
+ int gc_pid; /* PID that set gc_next */
+ uint32 gc_next; /* next garbage list to reclaim */
+ uint64 stats[CHS_NumberOfStatistics]; /* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+ (AssertMacro((offset) < (table)->arena_limit), \
+ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+ (AssertMacro(!CHashPtrIsInvalid(ptr)), \
+ CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+ ((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+ CHashPtr target;
+ CHashPtr next;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node;
+ bool found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+ "searches",
+ "searches failed",
+ "inserts",
+ "inserts failed",
+ "inserts retried",
+ "deletions",
+ "deletions failed",
+ "deletions retried",
+ "scan expunges",
+ "scan expunges failed",
+ "scans restarted",
+ "cleanup scans",
+ "allocations failed",
+ "garbage enqueues retried",
+ "garbage dequeues failed",
+ "garbage collections",
+ "garbage collection spins",
+ "garbage collection reclaims skipped",
+ "garbage collection fast reclaims",
+ "garbage collection reclaims retried",
+ "<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+ CHashTable table;
+ uint32 bucket_shift;
+
+ /* Sanity check. */
+ Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+ elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
+
+ /*
+ * The number of buckets must be a power of two. To avoid (as much as
+ * possible) having to traverse long bucket chains, we aim for a load
+ * factor <= 1.0, so this is a pretty simple calculation: we just find the
+ * smallest power of two greater than or equal to the target capacity.
+ */
+ bucket_shift = fls(desc->capacity) - 1;
+ table->bucket_mask = (1 << bucket_shift) - 1;
+
+ /*
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
+ */
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->gc_next = 0;
+ table->gc_pid = 0;
+
+ /*
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ /* Zero out statistics. */
+ memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+ return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+ Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
+
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
+
+ size = MAXALIGN(sizeof(CHashTableData));
+ size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+ return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+ uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ void *shmem;
+
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashPtr *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
+
+ /* Arena follows the various lists. */
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
+
+ /* Initialize all three sets of lists to empty. */
+ for (i = 0; i < nbuckets; ++i)
+ table->bucket[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
+
+ /* Put all arena elements on the free lists. */
+ for (i = 0; i < table->arena_limit; ++i)
+ {
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
+ CHashNode *n = CHashTableGetRaw(table, i);
+
+ n->un.gcnext = *f;
+ *f = MakeCHashPtr(i);
+ }
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
+
+/*
+ * Search a concurrent hash table. entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in. If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true. If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket and return data from any matching entry. */
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ CHashTableIncrementStatistic(table, CHS_Search);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Search_Failed);
+ return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table. entry should be the complete entry
+ * to be inserted. If no duplicate entry is found in the table, this function
+ * will insert the entry and return true. Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity. The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashPtr new;
+ CHashNode *nnew;
+ CHashScanResult scan;
+
+ /*
+ * Allocate and initialize a new entry, on the assumption that the insert
+ * will succeed. If it ends up failing, we must be sure to put this back
+ * on some free list, lest it be permanently leaked.
+ */
+ new = CHashAllocate(table);
+ nnew = CHashTableGetNode(table, new);
+ nnew->un.hashcode = hashcode;
+ memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+ /* Prevent garbage collection for this bucket. */
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /*
+ * Scan the bucket. If we don't find a match, use compare-and-swap to
+ * insert the new node at the insert position. If we do find a match,
+ * return the data to the caller.
+ */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+ else
+ {
+ /*
+ * We didn't find a matching element, so use compare-and-swap to
+ * attempt to insert the new element we've prepared. If this fails,
+ * someone concurrently inserted or deleted an element. The correct
+ * insertion point might have changed, or the key we're trying to
+ * insert might now be present when it wasn't before, so we'll have
+ * to search the bucket chain anew.
+ *
+ * There is a risk of starvation here, but we hope it will not happen
+ * in practice. Contention for inserting new elements should be
+ * spread out pretty much evenly across N+M possible insertion points,
+ * where N is the number of buckets and M is the number of elements
+ * in the table. Even for a quite modestly sized table this is likely
+ * to exceed the number of CPU cores.
+ */
+ Assert(!CHashPtrIsMarked(scan.target));
+ nnew->next = scan.target;
+ if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target, new))
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+ goto retry;
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /*
+ * If the insert failed, add the entry we found to the appropriate
+ * garbage list. We can't simply put this back on the freelist,
+ * because that leads to an ABA problem: some other backend could
+ * read the head of the freelist, go into the tank, and then use
+ * CAS to pop an element. If at that point, it finds the same
+ * element at the head of the freelist but with a different successor,
+ * we're hosed. To prevent that, we ensure that elements are added
+ * to a given freelist only after verifying that any allocations in
+ * progress at the time we popped the freelist have completed. This
+ * guarantees that any allocation still in progress at the time this
+ * element makes it back to the freelist is trying to allocate some
+ * other node.
+ */
+ CHashTableIncrementStatistic(table, CHS_Insert);
+ if (scan.found)
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+ CHashAddToGarbage(table, bucket, new);
+ }
+
+ /* The insert succeeded if and only if no duplicate was found. */
+ return !scan.found;
+}
+
+/*
+ * Delete from a concurrent hash table.  entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ *
+ * Deletion is performed in two steps: first, the target node's next pointer
+ * is delete-marked via compare-and-swap, which logically removes it; second,
+ * we attempt to physically unlink the node from the bucket chain.  If the
+ * unlink CAS fails, a cleanup rescan of the bucket removes the marked node
+ * as a side effect.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+	uint32		hashcode = hash_any(entry, table->desc.key_size);
+	uint32		bucket = hashcode & table->bucket_mask;
+	CHashPtr   *b = &table->bucket[bucket];
+	CHashScanResult scan;
+
+	/*
+	 * Prevent garbage collection for this bucket: advertise the bucket's
+	 * garbage list as our hazard pointer so a concurrent garbage collector
+	 * will spin rather than recycle nodes we may still be reading.
+	 */
+	Assert(MyProc->hazard[0] == NULL);
+	MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+	pg_memory_barrier();
+
+	/* Scan bucket. */
+retry:
+	CHashBucketScan(table, b, hashcode, entry, &scan);
+
+	/* If we found it, try to delete it. */
+	if (scan.found)
+	{
+		Assert(!CHashPtrIsMarked(scan.next));
+
+		/* Attempt to apply delete-mark. */
+		if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+										  scan.next,
+										  CHashPtrMark(scan.next)))
+		{
+			/* Someone modified the node concurrently; rescan from scratch. */
+			CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+			goto retry;
+		}
+
+		/* Deletion is done; attempt to remove node from list. */
+		if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+										 scan.target,
+										 scan.next))
+			CHashAddToGarbage(table, bucket, scan.target);
+		else
+		{
+			CHashScanResult cleanup_scan;
+
+			/*
+			 * If we weren't able to remove the deleted item, rescan
+			 * the bucket to make sure it's really gone.  This is just
+			 * like a regular bucket scan, except that we don't care
+			 * about the results.  We're just doing it to achieve the
+			 * side-effect of removing delete-marked nodes from the
+			 * bucket chain.
+			 */
+			CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+			CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+		}
+	}
+
+	/* Allow garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] != NULL);
+	pg_memory_barrier();
+	MyProc->hazard[0] = NULL;
+
+	/* We're done. */
+	CHashTableIncrementStatistic(table, CHS_Delete);
+	if (!scan.found)
+		CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+	return scan.found;
+}
+
+/*
+ * Provide backend-local statistics to caller.
+ *
+ * stats must point to an array of at least CHS_NumberOfStatistics uint64
+ * slots; the table's per-backend counters are copied into it.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+	memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise.  res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory.  If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+				CHashPtr *start,
+				uint32 hashcode,
+				const void *key,
+				CHashScanResult *res)
+{
+	CHashPtr	target;
+	CHashPtr   *pointer_to_target;
+	CHashNode  *target_node = NULL;
+
+	/* Restart point: used when a concurrent delete invalidates our position. */
+retry:
+	pointer_to_target = start;
+	target = *pointer_to_target;
+	for (;;)
+	{
+		CHashPtr	next;
+		uint32		h;
+		int			cmp;
+
+		/*
+		 * If we've reached the end of the bucket chain, stop; otherwise,
+		 * figure out the actual address of the next item.
+		 */
+		if (CHashPtrIsInvalid(target))
+		{
+			res->found = false;
+			break;
+		}
+		target_node = CHashTableGetNode(table, target);
+
+		/*
+		 * target may have been fetched from an arena entry that could be
+		 * concurrently modified, so a dependency barrier is required before
+		 * dereferencing the derived pointer.
+		 */
+		pg_read_barrier_depends();
+		next = target_node->next;
+
+		/*
+		 * For simplicity, any bucket scan, even if it's servicing a search,
+		 * will attempt to remove delete-marked items from the bucket.  This
+		 * ensures that delete-marked elements are removed from bucket chains
+		 * as quickly as possible and reduces code duplication.  See
+		 * CHashDelete for further comments about why delete-marking is
+		 * necessary and how it allows safe deletion.
+		 */
+		if (CHashPtrIsMarked(next))
+		{
+			/* Also reached from below when a later reread finds a mark. */
+zap:
+			if (__sync_bool_compare_and_swap(pointer_to_target,
+											 target,
+											 CHashPtrUnmark(next)))
+			{
+				/*
+				 * No one else can possibly have modified target_node->next,
+				 * because such modifications are not allowed after a
+				 * delete-mark has been applied.  Thus, if we just keep
+				 * following the next pointers, we're guaranteed to visit
+				 * all non-deleted items (and possibly some deleted items)
+				 * that were present at the time we began the scan.
+				 */
+				CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+				CHashAddToGarbage(table, hashcode & table->bucket_mask,
+								  target);
+				target = CHashPtrUnmark(next);
+			}
+			else
+			{
+				/*
+				 * If the previous node has been delete-marked, we can't
+				 * further update the next-pointer, so restart the scan.
+				 * Otherwise, we know that some other backend removed one
+				 * or more deleted nodes from the bucket chain just as we
+				 * were trying to do, and we can simply continue the scan
+				 * from wherever the previous node is pointing now.  It's
+				 * possible that some concurrent inserts have happened, too,
+				 * but that's OK; we can view those as having happened "before"
+				 * whatever operation this scan is supporting.
+				 *
+				 * Although starvation is a theoretical possibility here if
+				 * we are forced to retry repeatedly, even a single retry is
+				 * vanishingly unlikely in practice.  It requires some other
+				 * backend to delete both the node that we're looking at and
+				 * the node which precedes it before we advance to the next
+				 * node.  That could certainly happen occasionally, but we'd
+				 * have to be pretty unlucky to have it happen even twice in
+				 * a row.
+				 */
+				CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+				target = *pointer_to_target;
+				if (CHashPtrIsMarked(target))
+				{
+					CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+					goto retry;
+				}
+			}
+			continue;
+		}
+
+		/*
+		 * Bucket chains are kept in order, so that there is exactly one legal
+		 * point at which any given key can be inserted.  The ordering is by
+		 * hashcode first, and then by memcmp ordering of the keys involved.
+		 */
+		h = target_node->un.hashcode;
+		if (h == hashcode)
+			cmp = memcmp(CHashNodeGetItem(target_node), key,
+						 table->desc.key_size);
+		else if (h > hashcode)
+			cmp = 1;
+		else
+			cmp = -1;
+
+		/*
+		 * If cmp < 0, then we haven't yet reached the point at which we'd
+		 * expect to find the key, so we must continue the scan.  If cmp == 0,
+		 * we've found the key and can stop.  If cmp > 0, we've either passed
+		 * the point where we expect to find the key OR someone delete-marked
+		 * the item and overwrote the hashcode with a gcnext pointer.  In the
+		 * latter case we must take care not to be fooled into stopping the
+		 * scan early.
+		 */
+		if (cmp >= 0)
+		{
+			if (cmp == 0)
+			{
+				res->found = true;
+				res->next = next;
+				break;
+			}
+			else
+			{
+				/*
+				 * pg_read_barrier() prevents the reread of the next pointer
+				 * from being speculated ahead of the read of the hash value.
+				 */
+				pg_read_barrier();
+				next = target_node->next;
+				if (CHashPtrIsMarked(next))
+					goto zap;
+				res->found = false;
+				break;
+			}
+		}
+
+		/* Continue scan from next node. */
+		pointer_to_target = &target_node->next;
+		target = next;
+	}
+
+	/* Send results back to caller. */
+	res->target = target;
+	res->pointer_to_target = pointer_to_target;
+	res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists.  Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list.  In this way, if there's only one backend
+ * active, it will typically find a free buffer in the first place it looks:
+ * its own free list.  It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+	uint32		f_current;
+	CHashPtr	new;
+
+	/* Pick a starting freelist based on our backend ID. */
+	f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+	/* If this process hasn't initialized gc_next yet, do that now. */
+	if (table->gc_pid != MyProcPid)
+	{
+		table->gc_pid = MyProcPid;
+		table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+	}
+
+	/* Loop until we allocate a buffer. */
+	for (;;)
+	{
+		CHashPtr   *b;
+
+		/*
+		 * Try to pop a buffer from a freelist using compare-and-swap.
+		 *
+		 * Note that this is only safe if it's impossible for the next pointer
+		 * of the first element on the list to change between the time when
+		 * we read it and the time we use CAS to pop it off the list.  This
+		 * means that we can't allow any element that is currently on this
+		 * freelist to be allocated, marked as garbage, garbage collected,
+		 * and returned back to this freelist before we finish the CAS.
+		 *
+		 * If we attempt to pop the free-list and fail, we retry immediately
+		 * with the same free-list.  This reduces the frequency with which
+		 * we're obliged to update our hazard pointers, which is a material
+		 * savings due to the associated memory barrier.
+		 */
+		b = CHashTableGetFreeList(table, f_current);
+		/* Advertise the freelist we're reading so GC can't recycle under us. */
+		MyProc->hazard[0] = b;
+		pg_memory_barrier();
+		new = *b;
+		while (!CHashPtrIsInvalid(new))
+		{
+			CHashNode  *n = CHashTableGetNode(table, new);
+
+			/*
+			 * n is computed from table->freelist[f_current], which could
+			 * be modified by concurrent activity, so we need a dependency
+			 * barrier here.
+			 */
+			pg_read_barrier_depends();
+			if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+				return new;
+			CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+			new = *b;
+		}
+
+		/* Attempt garbage collection. */
+		new = CHashAllocateViaGC(table);
+		if (!CHashPtrIsInvalid(new))
+			return new;
+
+		/* Advance to next freelist. */
+		f_current = (f_current + 1) % CHashTableNFreeLists(table);
+	}
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ *
+ * We try to empty one garbage list in its entirety.  One reclaimed node is
+ * returned to satisfy the current allocation; any remaining nodes are pushed
+ * onto our home freelist.  Returns InvalidCHashPtr if no garbage could be
+ * collected.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+	uint32		f_home;
+	CHashPtr   *b;
+	CHashPtr   *fh;
+	CHashPtr	fhead;
+	CHashPtr	garbage;
+	CHashPtr	new;
+	CHashNode  *n;
+	uint32		i;
+
+	/* Pick a target freelist based on our backend ID. */
+	f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+	fh = CHashTableGetFreeList(table, f_home);
+
+	/* Select target garbage list, round-robining over all of them. */
+	table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+	b = CHashTableGetGarbageList(table, table->gc_next);
+	garbage = *b;
+
+	/* If list is empty, fail. */
+	if (CHashPtrIsInvalid(garbage))
+		return InvalidCHashPtr;
+
+	/* If we're unable to empty the list via compare-and-swap, fail. */
+	if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+	{
+		CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+		return InvalidCHashPtr;
+	}
+
+	/*
+	 * Before moving elements removed from the garbage list to the
+	 * freelist, we must wait until (1) all bucket scans that might
+	 * still see elements on the freelist as part of the bucket chain
+	 * have completed and (2) all allocations that might see an old
+	 * version of the freelist containing one of the elements to be
+	 * garbage collected have completed.
+	 *
+	 * Note: We can't begin this operation until the clearing of the
+	 * garbage list has been committed to memory, but since that was
+	 * done using an atomic operation no explicit barrier is needed
+	 * here.
+	 *
+	 * Note: We could have a "soft" version of this that merely
+	 * requeues the garbage if it's not immediately recyclable, but
+	 * it's not clear that we need such a thing.  On the flip side we
+	 * might want to eventually enter a longer sleep here, or PANIC,
+	 * but it's not clear exactly how to calibrate that.
+	 */
+	CHashTableIncrementStatistic(table, CHS_GC);
+	MyProc->hazard[0] = NULL;
+	for (i = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+		void	   *hazard;
+
+		/* Spin until this backend's hazard pointer leaves our lists. */
+		hazard = proc->hazard[0];
+		if (hazard == b || hazard == fh)
+		{
+			CHashTableIncrementStatistic(table, CHS_GC_Spin);
+			do
+			{
+				hazard = proc->hazard[0];
+			} while (hazard == b || hazard == fh);
+		}
+	}
+
+	/* Remove one item from list to satisfy current allocation. */
+	new = garbage;
+	n = CHashTableGetNode(table, new);
+	pg_read_barrier_depends();
+	fhead = n->un.gcnext;
+
+	if (CHashPtrIsInvalid(fhead))
+	{
+		/*
+		 * We have reclaimed exactly one node, so there's nothing to put
+		 * back on the free list.  In this case (only) we need a
+		 * memory barrier, because the reads above must complete
+		 * before we overwrite n->un.gcnext with a new hashcode.
+		 * (This is only needed when we reclaim exactly one node,
+		 * because in any other case we'll do a compare-and-swap
+		 * before returning, which implies a full barrier.)
+		 */
+		pg_memory_barrier();
+		CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+	}
+	else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+	{
+		/*
+		 * Our free list is empty, and we've successfully pushed the
+		 * reclaimed nodes onto it.  So we're done.
+		 */
+		CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+	}
+	else
+	{
+		CHashPtr	fcurrent;
+		CHashPtr	fnext;
+		CHashPtr	oldhead;
+
+		/* Walk list of reclaimed elements to end. */
+		fcurrent = fhead;
+		for (;;)
+		{
+			n = CHashTableGetNode(table, fcurrent);
+			fnext = n->un.gcnext;
+			if (CHashPtrIsInvalid(fnext))
+				break;
+			fcurrent = fnext;
+		}
+
+		/* Push reclaimed elements onto home free list. */
+		for (;;)
+		{
+			oldhead = *fh;
+			n->un.gcnext = oldhead;
+			if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+				break;
+			CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+		}
+	}
+
+	/* Return the element we saved for ourselves. */
+	return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list.  We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer.  It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+	CHashPtr	g;
+	CHashNode  *n;
+	CHashPtr   *garbage;
+
+	n = CHashTableGetNode(table, c);
+	garbage = CHashTableGetGarbageByBucket(table, bucket);
+
+	/* Standard lock-free push: link to current head, then CAS it in. */
+	while (1)
+	{
+		g = *garbage;
+		n->un.gcnext = g;
+		if (__sync_bool_compare_and_swap(garbage, g, c))
+			break;
+		CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+	}
+}
diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c
index e6f69e4edd..7374cc8eb4 100644
--- a/src/bin/pg_basebackup/pg_receivexlog.c
+++ b/src/bin/pg_basebackup/pg_receivexlog.c
@@ -66,9 +66,12 @@ usage(void)
printf(_(" %s [OPTION]...\n"), progname);
printf(_("\nOptions:\n"));
printf(_(" -D, --directory=DIR receive transaction log files into this directory\n"));
+ printf(_(" -F --fsync-interval=SECS\n"
+ " time between fsyncs to transaction log files (default: %d)\n"), (fsync_interval / 1000));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
- printf(_(" -F --fsync-interval=INTERVAL\n"
- " frequency of syncs to transaction log files (in seconds)\n"));
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -76,12 +79,9 @@ usage(void)
printf(_(" -d, --dbname=CONNSTR connection string\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
printf(_(" -p, --port=PORT database server port number\n"));
- printf(_(" -s, --status-interval=INTERVAL\n"
- " time between status packets sent to server (in seconds)\n"));
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_("\nOptional actions:\n"));
printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c
index 1a01167912..0d97638851 100644
--- a/src/bin/pg_basebackup/pg_recvlogical.c
+++ b/src/bin/pg_basebackup/pg_recvlogical.c
@@ -62,15 +62,27 @@ static void disconnect_and_exit(int code);
static void
usage(void)
{
- printf(_("%s receives PostgreSQL logical change stream.\n\n"),
+ printf(_("%s receives PostgreSQL logical change streams.\n\n"),
progname);
printf(_("Usage:\n"));
printf(_(" %s [OPTION]...\n"), progname);
+ printf(_("\nAction to be performed:\n"));
+ printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nOptions:\n"));
- printf(_(" -f, --file=FILE receive log into this file. - for stdout\n"));
+ printf(_(" -f, --file=FILE receive log into this file, - for stdout\n"));
printf(_(" -F --fsync-interval=SECS\n"
- " frequency of syncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ printf(_(" -I, --startpos=LSN where in an existing slot should the streaming start\n"));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
+ printf(_(" -o, --option=NAME[=VALUE]\n"
+ " pass option NAME with optional value VALUE to the\n"
+ " output plugin\n"));
+ printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -81,19 +93,6 @@ usage(void)
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_("\nReplication options:\n"));
- printf(_(" -I, --startpos=PTR where in an existing slot should the streaming start\n"));
- printf(_(" -o, --option=NAME[=VALUE]\n"
- " specify option NAME with optional value VALUE, to be passed\n"
- " to the output plugin\n"));
- printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
- printf(_(" -s, --status-interval=SECS\n"
- " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
- printf(_(" -S, --slot=SLOT name of the logical replication slot\n"));
- printf(_("\nAction to be performed:\n"));
- printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
- printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
- printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nReport bugs to <pgsql-bugs@postgresql.org>.\n"));
}
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index a46ca53ba6..733f1cbc86 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1456,7 +1456,9 @@ pgwin32_doRegister(void)
NULL, NULL, "RPCSS\0", register_username, register_password)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not register service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1484,14 +1486,18 @@ pgwin32_doUnregister(void)
if ((hService = OpenService(hSCM, register_servicename, DELETE)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not open service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not open service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
if (!DeleteService(hService))
{
CloseServiceHandle(hService);
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1627,7 +1633,9 @@ pgwin32_doRunAsService(void)
if (StartServiceCtrlDispatcher(st) == 0)
{
- write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not start service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
}
@@ -1708,7 +1716,14 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
/* Open the current token to use as a base for the restricted one */
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &origToken))
{
- write_stderr(_("%s: could not open process token: error code %lu\n"), progname, GetLastError());
+ /*
+ * Most Windows targets make DWORD a 32-bit unsigned long. Cygwin
+ * x86_64, an LP64 target, makes it a 32-bit unsigned int. In code
+ * built for Cygwin as well as for native Windows targets, cast DWORD
+ * before printing.
+ */
+ write_stderr(_("%s: could not open process token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1721,7 +1736,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_POWER_USERS, 0, 0, 0, 0, 0,
0, &dropSids[1].Sid))
{
- write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not allocate SIDs: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1740,7 +1756,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
if (!b)
{
- write_stderr(_("%s: could not create restricted token: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not create restricted token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1791,7 +1808,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
HANDLE job;
char jobname[128];
- sprintf(jobname, "PostgreSQL_%lu", processInfo->dwProcessId);
+ sprintf(jobname, "PostgreSQL_%lu",
+ (unsigned long) processInfo->dwProcessId);
job = _CreateJobObject(NULL, jobname);
if (job)
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index c9e61dfa39..0e1e0cd5f0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -127,6 +127,10 @@ typedef struct HashJoinTableData
int nbuckets; /* # buckets in the in-memory hash table */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
+ int nbuckets_original; /* # buckets when starting the first hash */
+ int nbuckets_optimal; /* optimal # buckets (per batch) */
+ int log2_nbuckets_optimal; /* same as log2_nbuckets optimal */
+
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
@@ -148,6 +152,7 @@ typedef struct HashJoinTableData
bool growEnabled; /* flag to shut off nbatch increases */
double totalTuples; /* # tuples obtained from inner plan */
+ double skewTuples; /* # tuples inserted into skew tuples */
/*
* These arrays are allocated for the life of the hash join, but only if
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index b36705b862..6ef779bf95 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
*/
#include "port/atomics.h"
+/*
+ * If dependency barriers are undefined, we define them as a no-op. The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
#endif /* BARRIER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0e69b633c3..4c6fac8052 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
)
/*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
- ((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -200,9 +186,9 @@ extern void StrategyInitialize(bool init);
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern int BufTableLookup(BufferTag *tagPtr);
+extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr);
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 595e69da48..8e98425ca4 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -140,7 +140,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
*/
/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
+#define NUM_BUFFER_PARTITIONS 0
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 38758d3ea5..cdf2f268fd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,14 @@ struct XidCache
#define FP_LOCK_SLOTS_PER_BACKEND 16
/*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define NUM_HAZARD_POINTERS 1
+
+/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
@@ -143,6 +151,12 @@ struct PGPROC
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
+
+ /*
+ * Hazard pointers. Currently one is enough, though some algorithms
+ * require a few more.
+ */
+ void *hazard[NUM_HAZARD_POINTERS];
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h
index 5bae3807af..063400ae28 100644
--- a/src/include/storage/shm_mq.h
+++ b/src/include/storage/shm_mq.h
@@ -25,6 +25,13 @@ typedef struct shm_mq shm_mq;
struct shm_mq_handle;
typedef struct shm_mq_handle shm_mq_handle;
+/* Descriptors for a single write spanning multiple locations. */
+typedef struct
+{
+ const char *data;
+ Size len;
+} shm_mq_iovec;
+
/* Possible results of a send or receive operation. */
typedef enum
{
@@ -52,12 +59,17 @@ extern PGPROC *shm_mq_get_sender(shm_mq *);
extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg,
BackgroundWorkerHandle *handle);
+/* Associate worker handle with shm_mq. */
+extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *);
+
/* Break connection. */
extern void shm_mq_detach(shm_mq *);
/* Send or receive messages. */
extern shm_mq_result shm_mq_send(shm_mq_handle *mqh,
- Size nbytes, void *data, bool nowait);
+ Size nbytes, const void *data, bool nowait);
+extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh,
+ shm_mq_iovec *iov, int iovcnt, bool nowait);
extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh,
Size *nbytesp, void **datap, bool nowait);
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 745eb7e576..4ff8415fac 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index d88e7a3b26..fb1b4a42dd 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -676,13 +676,10 @@ extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_indexdef_string(Oid indexrelid);
-extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_constraintdef_string(Oid constraintId);
extern Datum pg_get_expr(PG_FUNCTION_ARGS);
extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
@@ -692,17 +689,9 @@ extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
-extern char *deparse_expression(Node *expr, List *dpcontext,
- bool forceprefix, bool showimplicit);
-extern List *deparse_context_for(const char *aliasname, Oid relid);
-extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
- List *rtable, List *rtable_names);
-extern List *select_rtable_names_for_explain(List *rtable,
- Bitmapset *rels_used);
extern const char *quote_identifier(const char *ident);
extern char *quote_qualified_identifier(const char *qualifier,
const char *ident);
-extern char *generate_collation_name(Oid collid);
/* tid.c */
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000000..ee0573c9c7
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ * Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H
+#define CHASH_H
+
+/* Everything caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+ const char *shmem_name; /* shared memory name for this hash table */
+ uint32 capacity; /* maximum size of hash table */
+ uint16 element_size; /* size of each element */
+ uint16 key_size; /* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+ CHS_Search, /* search */
+ CHS_Search_Failed, /* search failed (no such key) */
+ CHS_Insert, /* insert */
+ CHS_Insert_Failed, /* insert failed (duplicate key) */
+ CHS_Insert_Retry, /* insert retried (concurrent update) */
+ CHS_Delete, /* delete */
+ CHS_Delete_Failed, /* delete failed (no such key) */
+ CHS_Delete_Retry, /* delete retried (concurrent update) */
+ CHS_Scan_Expunge, /* scan expunged deleted item */
+ CHS_Scan_Expunge_Fail, /* scan failed to expunge */
+ CHS_Scan_Restart, /* concurrent deletes forced a scan restart */
+ CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */
+ CHS_Allocate_Fail, /* allocation failed to pop freelist */
+ CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */
+ CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */
+ CHS_GC, /* garbage collection cycle */
+ CHS_GC_Spin, /* GC spun waiting for concurrent process */
+ CHS_GC_Reclaim_Skipped, /* GC recovered only one item */
+ CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */
+ CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */
+ CHS_NumberOfStatistics /* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif /* CHASH_H */
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
new file mode 100644
index 0000000000..520b06653c
--- /dev/null
+++ b/src/include/utils/ruleutils.h
@@ -0,0 +1,34 @@
+/*-------------------------------------------------------------------------
+ *
+ * ruleutils.h
+ * Declarations for ruleutils.c
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/ruleutils.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RULEUTILS_H
+#define RULEUTILS_H
+
+#include "nodes/nodes.h"
+#include "nodes/parsenodes.h"
+#include "nodes/pg_list.h"
+
+
+extern char *pg_get_indexdef_string(Oid indexrelid);
+extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
+
+extern char *pg_get_constraintdef_string(Oid constraintId);
+extern char *deparse_expression(Node *expr, List *dpcontext,
+ bool forceprefix, bool showimplicit);
+extern List *deparse_context_for(const char *aliasname, Oid relid);
+extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
+ List *rtable, List *rtable_names);
+extern List *select_rtable_names_for_explain(List *rtable,
+ Bitmapset *rels_used);
+extern char *generate_collation_name(Oid collid);
+
+#endif /* RULEUTILS_H */
diff --git a/src/port/crypt.c b/src/port/crypt.c
index ef8bf46338..6a902ef0fc 100644
--- a/src/port/crypt.c
+++ b/src/port/crypt.c
@@ -87,7 +87,7 @@ static int des_cipher(const char *in, char *out, long salt, int num_iter);
* define "B64" to be the declaration for a 64 bit integer.
* XXX this feature is currently unused, see "endian" comment below.
*/
-#define B64 __int64
+/* #define B64 int64 */
/*
* define "LARGEDATA" to get faster permutations, by using about 72 kilobytes
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index eb37da7168..9146f59435 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/jsonb_1.out b/src/test/regress/expected/jsonb_1.out
index f3bfc7bcf5..83d61f8c7e 100644
--- a/src/test/regress/expected/jsonb_1.out
+++ b/src/test/regress/expected/jsonb_1.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
index 1076b76210..b04510c599 100644
--- a/src/test/regress/expected/matview.out
+++ b/src/test/regress/expected/matview.out
@@ -411,7 +411,7 @@ REFRESH MATERIALIZED VIEW mv;
ERROR: could not create unique index "mv_a_idx"
DETAIL: Key (a)=(1) is duplicated.
REFRESH MATERIALIZED VIEW CONCURRENTLY mv;
-ERROR: new data for "mv" contains duplicate rows without any NULL columns
+ERROR: new data for "mv" contains duplicate rows without any null columns
DETAIL: Row: (1,10)
DROP TABLE foo CASCADE;
NOTICE: drop cascades to materialized view mv
diff --git a/src/test/regress/expected/polygon.out b/src/test/regress/expected/polygon.out
index b252902720..33388eb909 100644
--- a/src/test/regress/expected/polygon.out
+++ b/src/test/regress/expected/polygon.out
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
INSERT INTO POLYGON_TBL(f1) VALUES ('(2.0,0.0),(2.0,4.0),(0.0,0.0)');
@@ -128,15 +128,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -248,11 +249,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(
(1 row)
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";
true
diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
index ed266d5c88..f1ed021be2 100644
--- a/src/test/regress/sql/jsonb.sql
+++ b/src/test/regress/sql/jsonb.sql
@@ -156,6 +156,13 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"c"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "c":null}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "g":null}', '{"a":"b", "b":1, "c":null}');
diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql
index 2dad566f37..d95fa96447 100644
--- a/src/test/regress/sql/polygon.sql
+++ b/src/test/regress/sql/polygon.sql
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
@@ -83,15 +83,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -155,11 +156,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' ~= polygon '(3.0,1.0),(3.0,3.0),(
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS true;
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";