 contrib/Makefile | 1
 contrib/hashtest/Makefile | 18
 contrib/hashtest/hashtest--1.0.sql | 52
 contrib/hashtest/hashtest.c | 527
 contrib/hashtest/hashtest.control | 4
 contrib/pg_upgrade/check.c | 216
 contrib/pg_upgrade/controldata.c | 34
 contrib/pg_upgrade/info.c | 14
 contrib/pg_upgrade/pg_upgrade.h | 8
 doc/src/sgml/catalogs.sgml | 3
 doc/src/sgml/json.sgml | 12
 doc/src/sgml/plpgsql.sgml | 4
 src/Makefile.global.in | 2
 src/backend/access/index/genam.c | 1
 src/backend/access/nbtree/nbtpage.c | 2
 src/backend/access/transam/xlog.c | 6
 src/backend/catalog/heap.c | 1
 src/backend/commands/dbcommands.c | 2
 src/backend/commands/explain.c | 12
 src/backend/commands/matview.c | 2
 src/backend/commands/tablecmds.c | 3
 src/backend/commands/typecmds.c | 1
 src/backend/commands/view.c | 2
 src/backend/executor/nodeHash.c | 131
 src/backend/libpq/be-secure-openssl.c | 2
 src/backend/storage/buffer/buf_table.c | 109
 src/backend/storage/buffer/bufmgr.c | 194
 src/backend/storage/ipc/shm_mq.c | 126
 src/backend/storage/ipc/shmem.c | 23
 src/backend/utils/adt/jsonb_op.c | 10
 src/backend/utils/adt/jsonb_util.c | 16
 src/backend/utils/adt/misc.c | 1
 src/backend/utils/adt/ruleutils.c | 1
 src/backend/utils/hash/Makefile | 2
 src/backend/utils/hash/chash.c | 1075
 src/bin/pg_basebackup/pg_receivexlog.c | 10
 src/bin/pg_basebackup/pg_recvlogical.c | 31
 src/bin/pg_ctl/pg_ctl.c | 34
 src/include/executor/hashjoin.h | 5
 src/include/storage/barrier.h | 8
 src/include/storage/buf_internals.h | 20
 src/include/storage/lwlock.h | 2
 src/include/storage/proc.h | 14
 src/include/storage/shm_mq.h | 14
 src/include/storage/shmem.h | 1
 src/include/utils/builtins.h | 11
 src/include/utils/chash.h | 69
 src/include/utils/ruleutils.h | 34
 src/port/crypt.c | 2
 src/test/regress/expected/jsonb.out | 36
 src/test/regress/expected/jsonb_1.out | 36
 src/test/regress/expected/matview.out | 2
 src/test/regress/expected/polygon.out | 35
 src/test/regress/sql/jsonb.sql | 7
 src/test/regress/sql/polygon.sql | 35
 55 files changed, 2493 insertions(+), 530 deletions(-)
diff --git a/contrib/Makefile b/contrib/Makefile
index b37d0dd2c3..0b91ac10ee 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
earthdistance \
file_fdw \
fuzzystrmatch \
+ hashtest \
hstore \
intagg \
intarray \
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000000..3ee42f87d8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000000..e271baff0f
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
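(Usage note, not part of the patch: hashtest's _PG_init(), in hashtest.c below, sets up shared memory only when the library is loaded via shared_preload_libraries, so the module must be preloaded; the tests are then driven from SQL, e.g. CREATE EXTENSION hashtest; SELECT chash_insert_test();. The *_concurrent_test() functions are intended to be run from several sessions at once.)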
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000000..172a5bb156
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+Datum chash_insert_test(PG_FUNCTION_ARGS);
+Datum chash_search_test(PG_FUNCTION_ARGS);
+Datum chash_delete_test(PG_FUNCTION_ARGS);
+Datum chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum chash_collision_test(PG_FUNCTION_ARGS);
+Datum dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum dynahash_search_test(PG_FUNCTION_ARGS);
+Datum dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+ uint32 key;
+ uint32 val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+ "hashtest-chash", /* name */
+ 1048576, /* capacity */
+ sizeof(hentry), /* element size */
+ sizeof(uint32) /* key size */
+};
+
+#define DYNAHASH_PARTITIONS 16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+ (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+ Size cs;
+ Size ds;
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = hashtest_shmem_startup;
+ chash = CHashBootstrap(&cdesc);
+ cs = CHashEstimateSize(chash);
+ RequestAddinShmemSpace(cs);
+ ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+ RequestAddinShmemSpace(ds);
+ elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+ (unsigned) ds);
+ RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = hashtest_client_auth_hook;
+
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+ on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+ uint64 stats[CHS_NumberOfStatistics];
+ CHashStatisticsType i;
+ StringInfoData buf;
+
+ CHashStatistics(chash, stats);
+ initStringInfo(&buf);
+
+ for (i = 0; i < CHS_NumberOfStatistics; ++i)
+ {
+ if (stats[i] == 0)
+ continue;
+ appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+ CHashStatisticsNames[i]);
+ }
+
+ if (buf.len > 1)
+ {
+ buf.data[buf.len-2] = '\0';
+ elog(LOG, "chash statistics: %s", buf.data);
+ }
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+ HASHCTL info;
+ uint32 i;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Initialize concurrent hash table. */
+ chash = CHashInitialize(chash, &cdesc);
+
+ /* Initialize shared dynahash table. */
+ info.keysize = cdesc.key_size;
+ info.entrysize = cdesc.element_size;
+ info.hash = tag_hash;
+ info.num_partitions = DYNAHASH_PARTITIONS;
+
+ dynahash = ShmemInitHash("hashtest-dynahash",
+ cdesc.capacity, cdesc.capacity,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+ dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = i * 31;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != e.key * 31)
+ elog(LOG, "search %u: found %u", i, e.val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = CHashDelete(chash, &e);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = 0;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!CHashSearch(chash, &e))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!CHashDelete(chash, &e))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, e.val);
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+ bool found;
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_ENTER, &found);
+ if (!found)
+ e->val = val;
+ LWLockRelease(lockid);
+
+ return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_SHARED);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_FIND, NULL);
+ if (e)
+ *val = e->val;
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_REMOVE, NULL);
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, i * 31);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = dynahash_insert(i, i * 31);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+ uint32 val;
+
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != i * 31)
+ elog(LOG, "search %u: found %u", i, val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = dynahash_delete(i);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(seed | i, MyProcPid);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_search(seed | i, &val);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!dynahash_search(seed | i, &val))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(seed | i);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!dynahash_delete(seed | i))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, MyProcPid);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000000..b8e0f01346
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index bbfcab71ce..56db0dd654 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -14,12 +14,10 @@
#include "pg_upgrade.h"
-static void set_locale_and_encoding(ClusterInfo *cluster);
static void check_new_cluster_is_empty(void);
-static void check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl);
-static bool equivalent_locale(const char *loca, const char *locb);
-static bool equivalent_encoding(const char *chara, const char *charb);
+static void check_databases_are_compatible(void);
+static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb);
+static bool equivalent_locale(int category, const char *loca, const char *locb);
static void check_is_install_user(ClusterInfo *cluster);
static void check_for_prepared_transactions(ClusterInfo *cluster);
static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -81,8 +79,6 @@ check_and_dump_old_cluster(bool live_check)
if (!live_check)
start_postmaster(&old_cluster, true);
- set_locale_and_encoding(&old_cluster);
-
get_pg_database_relfilenode(&old_cluster);
/* Extract a list of databases and tables from the old cluster */
@@ -127,13 +123,10 @@ check_and_dump_old_cluster(bool live_check)
void
check_new_cluster(void)
{
- set_locale_and_encoding(&new_cluster);
-
- check_locale_and_encoding(&old_cluster.controldata, &new_cluster.controldata);
-
get_db_and_rel_infos(&new_cluster);
check_new_cluster_is_empty();
+ check_databases_are_compatible();
check_loadable_libraries();
@@ -279,93 +272,25 @@ check_cluster_compatibility(bool live_check)
/*
- * set_locale_and_encoding()
- *
- * query the database to get the template0 locale
- */
-static void
-set_locale_and_encoding(ClusterInfo *cluster)
-{
- ControlData *ctrl = &cluster->controldata;
- PGconn *conn;
- PGresult *res;
- int i_encoding;
- int cluster_version = cluster->major_version;
-
- conn = connectToServer(cluster, "template1");
-
- /* for pg < 80400, we got the values from pg_controldata */
- if (cluster_version >= 80400)
- {
- int i_datcollate;
- int i_datctype;
-
- res = executeQueryOrDie(conn,
- "SELECT datcollate, datctype "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_datcollate = PQfnumber(res, "datcollate");
- i_datctype = PQfnumber(res, "datctype");
-
- if (GET_MAJOR_VERSION(cluster->major_version) < 902)
- {
- /*
- * Pre-9.2 did not canonicalize the supplied locale names to match
- * what the system returns, while 9.2+ does, so convert pre-9.2 to
- * match.
- */
- ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE,
- pg_strdup(PQgetvalue(res, 0, i_datcollate)));
- ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE,
- pg_strdup(PQgetvalue(res, 0, i_datctype)));
- }
- else
- {
- ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
- ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
- }
-
- PQclear(res);
- }
-
- res = executeQueryOrDie(conn,
- "SELECT pg_catalog.pg_encoding_to_char(encoding) "
- "FROM pg_catalog.pg_database "
- "WHERE datname = 'template0' ");
- assert(PQntuples(res) == 1);
-
- i_encoding = PQfnumber(res, "pg_encoding_to_char");
- ctrl->encoding = pg_strdup(PQgetvalue(res, 0, i_encoding));
-
- PQclear(res);
-
- PQfinish(conn);
-}
-
-
-/*
* check_locale_and_encoding()
*
- * Check that old and new locale and encoding match. Even though the backend
- * tries to canonicalize stored locale names, the platform often doesn't
- * cooperate, so it's entirely possible that one DB thinks its locale is
- * "en_US.UTF-8" while the other says "en_US.utf8". Try to be forgiving.
+ * Check that locale and encoding of a database in the old and new clusters
+ * are compatible.
*/
static void
-check_locale_and_encoding(ControlData *oldctrl,
- ControlData *newctrl)
+check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb)
{
- if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate))
- pg_fatal("lc_collate cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_collate, newctrl->lc_collate);
- if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype))
- pg_fatal("lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->lc_ctype, newctrl->lc_ctype);
- if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
- pg_fatal("encoding cluster values do not match: old \"%s\", new \"%s\"\n",
- oldctrl->encoding, newctrl->encoding);
+ if (olddb->db_encoding != newdb->db_encoding)
+ pg_fatal("encodings for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name,
+ pg_encoding_to_char(olddb->db_encoding),
+ pg_encoding_to_char(newdb->db_encoding));
+ if (!equivalent_locale(LC_COLLATE, olddb->db_collate, newdb->db_collate))
+ pg_fatal("lc_collate values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_collate, newdb->db_collate);
+ if (!equivalent_locale(LC_CTYPE, olddb->db_ctype, newdb->db_ctype))
+ pg_fatal("lc_ctype values for database \"%s\" do not match: old \"%s\", new \"%s\"\n",
+ olddb->db_name, olddb->db_ctype, newdb->db_ctype);
}
/*
@@ -373,61 +298,46 @@ check_locale_and_encoding(ControlData *oldctrl,
*
* Best effort locale-name comparison. Return false if we are not 100% sure
* the locales are equivalent.
+ *
+ * Note: The encoding parts of the names are ignored. This function is
+ * currently used to compare locale names stored in pg_database, and
+ * pg_database contains a separate encoding field. That's compared directly
+ * in check_locale_and_encoding().
*/
static bool
-equivalent_locale(const char *loca, const char *locb)
+equivalent_locale(int category, const char *loca, const char *locb)
{
- const char *chara = strrchr(loca, '.');
- const char *charb = strrchr(locb, '.');
- int lencmp;
-
- /* If they don't both contain an encoding part, just do strcasecmp(). */
- if (!chara || !charb)
- return (pg_strcasecmp(loca, locb) == 0);
+ const char *chara;
+ const char *charb;
+ char *canona;
+ char *canonb;
+ int lena;
+ int lenb;
/*
- * Compare the encoding parts. Windows tends to use code page numbers for
- * the encoding part, which equivalent_encoding() won't like, so accept if
- * the strings are case-insensitive equal; otherwise use
- * equivalent_encoding() to compare.
+ * If the names are equal, the locales are equivalent. Checking this
+ * first avoids calling setlocale() in the common case that the names
+ * are equal. That's a good thing if, for example, setlocale() is buggy.
*/
- if (pg_strcasecmp(chara + 1, charb + 1) != 0 &&
- !equivalent_encoding(chara + 1, charb + 1))
- return false;
+ if (pg_strcasecmp(loca, locb) == 0)
+ return true;
/*
- * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8).
- *
- * It's tempting to ignore non-alphanumeric chars here, but for now it's
- * not clear that that's necessary; just do case-insensitive comparison.
+ * Not identical. Canonicalize both names, remove the encoding parts,
+ * and try again.
*/
- lencmp = chara - loca;
- if (lencmp != charb - locb)
- return false;
+ canona = get_canonical_locale_name(category, loca);
+ chara = strrchr(canona, '.');
+ lena = chara ? (chara - canona) : strlen(canona);
- return (pg_strncasecmp(loca, locb, lencmp) == 0);
-}
+ canonb = get_canonical_locale_name(category, locb);
+ charb = strrchr(canonb, '.');
+ lenb = charb ? (charb - canonb) : strlen(canonb);
-/*
- * equivalent_encoding()
- *
- * Best effort encoding-name comparison. Return true only if the encodings
- * are valid server-side encodings and known equivalent.
- *
- * Because the lookup in pg_valid_server_encoding() does case folding and
- * ignores non-alphanumeric characters, this will recognize many popular
- * variant spellings as equivalent, eg "utf8" and "UTF-8" will match.
- */
-static bool
-equivalent_encoding(const char *chara, const char *charb)
-{
- int enca = pg_valid_server_encoding(chara);
- int encb = pg_valid_server_encoding(charb);
+ if (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0)
+ return true;
- if (enca < 0 || encb < 0)
- return false;
-
- return (enca == encb);
+ return false;
}
@@ -450,7 +360,35 @@ check_new_cluster_is_empty(void)
new_cluster.dbarr.dbs[dbnum].db_name);
}
}
+}
+
+/*
+ * Check that every database that already exists in the new cluster is
+ * compatible with the corresponding database in the old one.
+ */
+static void
+check_databases_are_compatible(void)
+{
+ int newdbnum;
+ int olddbnum;
+ DbInfo *newdbinfo;
+ DbInfo *olddbinfo;
+ for (newdbnum = 0; newdbnum < new_cluster.dbarr.ndbs; newdbnum++)
+ {
+ newdbinfo = &new_cluster.dbarr.dbs[newdbnum];
+
+ /* Find the corresponding database in the old cluster */
+ for (olddbnum = 0; olddbnum < old_cluster.dbarr.ndbs; olddbnum++)
+ {
+ olddbinfo = &old_cluster.dbarr.dbs[olddbnum];
+ if (strcmp(newdbinfo->db_name, olddbinfo->db_name) == 0)
+ {
+ check_locale_and_encoding(olddbinfo, newdbinfo);
+ break;
+ }
+ }
+ }
}
@@ -470,7 +408,8 @@ create_script_for_cluster_analyze(char **analyze_script_file_name)
if (os_info.user_specified)
user_specification = psprintf("-U \"%s\" ", os_info.user);
- *analyze_script_file_name = psprintf("analyze_new_cluster.%s", SCRIPT_EXT);
+ *analyze_script_file_name = psprintf("%sanalyze_new_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
if ((script = fopen_priv(*analyze_script_file_name, "w")) == NULL)
pg_fatal("Could not open file \"%s\": %s\n",
@@ -551,7 +490,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
int tblnum;
char old_cluster_pgdata[MAXPGPATH];
- *deletion_script_file_name = psprintf("delete_old_cluster.%s", SCRIPT_EXT);
+ *deletion_script_file_name = psprintf("%sdelete_old_cluster.%s",
+ SCRIPT_PREFIX, SCRIPT_EXT);
/*
* Some users (oddly) create tablespaces inside the cluster data
diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c
index 8379ebd71b..4e9d5948fa 100644
--- a/contrib/pg_upgrade/controldata.c
+++ b/contrib/pg_upgrade/controldata.c
@@ -122,10 +122,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
pg_fatal("Could not get control data using %s: %s\n",
cmd, getErrorText(errno));
- /* Only pre-8.4 has these so if they are not set below we will check later */
- cluster->controldata.lc_collate = NULL;
- cluster->controldata.lc_ctype = NULL;
-
/* Only in <= 9.2 */
if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
{
@@ -404,36 +400,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
cluster->controldata.data_checksum_version = str2uint(p);
got_data_checksum_version = true;
}
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_collate = pg_strdup(p);
- }
- /* In pre-8.4 only */
- else if ((p = strstr(bufin, "LC_CTYPE:")) != NULL)
- {
- p = strchr(p, ':');
-
- if (p == NULL || strlen(p) <= 1)
- pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
- p++; /* remove ':' char */
- /* skip leading spaces and remove trailing newline */
- p += strspn(p, " ");
- if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
- *(p + strlen(p) - 1) = '\0';
- cluster->controldata.lc_ctype = pg_strdup(p);
- }
}
if (output)
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
index a1773aa8e5..c347dfc493 100644
--- a/contrib/pg_upgrade/info.c
+++ b/contrib/pg_upgrade/info.c
@@ -239,11 +239,15 @@ get_db_infos(ClusterInfo *cluster)
DbInfo *dbinfos;
int i_datname,
i_oid,
+ i_encoding,
+ i_datcollate,
+ i_datctype,
i_spclocation;
char query[QUERY_ALLOC];
snprintf(query, sizeof(query),
- "SELECT d.oid, d.datname, %s "
+ "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "
+ "%s AS spclocation "
"FROM pg_catalog.pg_database d "
" LEFT OUTER JOIN pg_catalog.pg_tablespace t "
" ON d.dattablespace = t.oid "
@@ -252,12 +256,15 @@ get_db_infos(ClusterInfo *cluster)
"ORDER BY 2",
/* 9.2 removed the spclocation column */
(GET_MAJOR_VERSION(cluster->major_version) <= 901) ?
- "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid) AS spclocation");
+ "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid)");
res = executeQueryOrDie(conn, "%s", query);
i_oid = PQfnumber(res, "oid");
i_datname = PQfnumber(res, "datname");
+ i_encoding = PQfnumber(res, "encoding");
+ i_datcollate = PQfnumber(res, "datcollate");
+ i_datctype = PQfnumber(res, "datctype");
i_spclocation = PQfnumber(res, "spclocation");
ntups = PQntuples(res);
@@ -267,6 +274,9 @@ get_db_infos(ClusterInfo *cluster)
{
dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid));
dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname));
+ dbinfos[tupnum].db_encoding = atoi(PQgetvalue(res, tupnum, i_encoding));
+ dbinfos[tupnum].db_collate = pg_strdup(PQgetvalue(res, tupnum, i_datcollate));
+ dbinfos[tupnum].db_ctype = pg_strdup(PQgetvalue(res, tupnum, i_datctype));
snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s",
PQgetvalue(res, tupnum, i_spclocation));
}
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 56a7505a96..c3b81e4a08 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -76,6 +76,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '/'
#define RM_CMD "rm -f"
#define RMDIR_CMD "rm -rf"
+#define SCRIPT_PREFIX "./"
#define SCRIPT_EXT "sh"
#define ECHO_QUOTE "'"
#define ECHO_BLANK ""
@@ -86,6 +87,7 @@ extern char *output_files[];
#define PATH_SEPARATOR '\\'
#define RM_CMD "DEL /q"
#define RMDIR_CMD "RMDIR /s/q"
+#define SCRIPT_PREFIX ""
#define SCRIPT_EXT "bat"
#define EXE_EXT ".exe"
#define ECHO_QUOTE ""
@@ -180,6 +182,9 @@ typedef struct
char *db_name; /* database name */
char db_tablespace[MAXPGPATH]; /* database default tablespace
* path */
+ char *db_collate;
+ char *db_ctype;
+ int db_encoding;
RelInfoArr rel_arr; /* array of all user relinfos */
} DbInfo;
@@ -218,9 +223,6 @@ typedef struct
bool date_is_int;
bool float8_pass_by_value;
bool data_checksum_version;
- char *lc_collate;
- char *lc_ctype;
- char *encoding;
} ControlData;
/*
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index f4617b67e9..f98e282741 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1227,8 +1227,7 @@
<entry><type>bool</type></entry>
<entry></entry>
<entry>
- This represents a not-null constraint. It is possible to
- change this column to enable or disable the constraint.
+ This represents a not-null constraint.
</entry>
</row>
diff --git a/doc/src/sgml/json.sgml b/doc/src/sgml/json.sgml
index 37dd611aeb..8feb2fbf0a 100644
--- a/doc/src/sgml/json.sgml
+++ b/doc/src/sgml/json.sgml
@@ -269,6 +269,12 @@ SELECT '"foo"'::jsonb @> '"foo"'::jsonb;
-- The array on the right side is contained within the one on the left:
SELECT '[1, 2, 3]'::jsonb @> '[1, 3]'::jsonb;
+-- Order of array elements is not significant, so this is also true:
+SELECT '[1, 2, 3]'::jsonb @> '[3, 1]'::jsonb;
+
+-- Duplicate array elements don't matter either:
+SELECT '[1, 2, 3]'::jsonb @> '[1, 2, 2]'::jsonb;
+
-- The object with a single pair on the right side is contained
-- within the object on the left side:
SELECT '{"product": "PostgreSQL", "version": 9.4, "jsonb":true}'::jsonb @> '{"version":9.4}'::jsonb;
@@ -288,8 +294,10 @@ SELECT '{"foo": {"bar": "baz"}}'::jsonb @> '{"bar": "baz"}'::jsonb; -- yields f
The general principle is that the contained object must match the
containing object as to structure and data contents, possibly after
discarding some non-matching array elements or object key/value pairs
- from the containing object. However, the order of array elements is
- not significant when doing a containment match.
+ from the containing object.
+ But remember that the order of array elements is not significant when
+ doing a containment match, and duplicate array elements are effectively
+ considered only once.
</para>
<para>
diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index f008e937ee..f195495520 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -487,8 +487,8 @@ $$ LANGUAGE plpgsql;
CREATE FUNCTION extended_sales(p_itemno int)
RETURNS TABLE(quantity int, total numeric) AS $$
BEGIN
- RETURN QUERY SELECT quantity, quantity * price FROM sales
- WHERE itemno = p_itemno;
+ RETURN QUERY SELECT s.quantity, s.quantity * s.price FROM sales AS s
+ WHERE s.itemno = p_itemno;
END;
$$ LANGUAGE plpgsql;
</programlisting>
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 2af9413f21..e76b22fb2d 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -302,7 +302,7 @@ PROVE_FLAGS = --verbose
# prepend to path if already set, else just set it
define add_to_path
-$(1)='$(if $($(1)),$(2):$$$(1),$(2))'
+$(1)="$(if $($(1)),$(2):$$$(1),$(2))"
endef
# platform-specific environment variable to set shared library path
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 850008b340..8849c08e54 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -28,6 +28,7 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index bab5a49187..b71f65de2c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1186,7 +1186,7 @@ _bt_pagedel(Relation rel, Buffer buf)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
- errhint("This can be caused by an interrupt VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+ errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
_bt_relbuf(rel, buf);
return ndeleted;
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 5a4dbb9c53..235b442296 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5193,8 +5193,8 @@ readRecoveryCommandFile(void)
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid recovery_target parameter"),
- errhint("The only allowed value is 'immediate'")));
+ errmsg("invalid value for recovery parameter \"recovery_target\""),
+ errhint("The only allowed value is \"immediate\".")));
ereport(DEBUG2,
(errmsg_internal("recovery_target = '%s'",
item->value)));
@@ -5257,7 +5257,7 @@ readRecoveryCommandFile(void)
"recovery_min_apply_delay"),
hintmsg ? errhint("%s", _(hintmsg)) : 0));
ereport(DEBUG2,
- (errmsg("recovery_min_apply_delay = '%s'", item->value)));
+ (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
}
else
ereport(FATAL,
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 55c1e79563..c0eade0a3d 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -69,6 +69,7 @@
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 7831e900ba..b52e6b9bc0 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -839,7 +839,7 @@ dropdb(const char *dbname, bool missing_ok)
if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active))
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
- errmsg("database \"%s\" is used by a logical decoding slot",
+ errmsg("database \"%s\" is used by a logical replication slot",
dbname),
errdetail_plural("There is %d slot, %d of them active.",
"There are %d slots, %d of them active.",
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736115..387d263e87 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -28,6 +28,7 @@
#include "utils/json.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tuplesort.h"
#include "utils/xml.h"
@@ -1900,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+ ExplainPropertyLong("Original Hash Buckets",
+ hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch)
+ else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+ (hashtable->nbuckets_original != hashtable->nbuckets))
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
- hashtable->nbuckets, hashtable->nbatch,
- hashtable->nbatch_original, spacePeakKb);
+ "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ hashtable->nbuckets, hashtable->nbuckets_original,
+ hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
}
else
{
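Illustration, not part of the patch: with this change, the text-format EXPLAIN ANALYZE line for a hash whose bucket count grew at runtime reports both the final and original counts, roughly like this (numbers hypothetical):

  Buckets: 2048 (originally 1024) Batches: 1 (originally 1) Memory Usage: 565kB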
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index d1c8bb0d53..30bd40db18 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -597,7 +597,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner,
{
ereport(ERROR,
(errcode(ERRCODE_CARDINALITY_VIOLATION),
- errmsg("new data for \"%s\" contains duplicate rows without any NULL columns",
+ errmsg("new data for \"%s\" contains duplicate rows without any null columns",
RelationGetRelationName(matviewRel)),
errdetail("Row: %s",
SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1))));
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index cb16c53a60..ecdff1e5e3 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -85,6 +85,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
@@ -9045,7 +9046,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation,
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
}
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index ad364efbcb..55a68810f2 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -72,6 +72,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c
index 9d0039c42a..184bcd0582 100644
--- a/src/backend/commands/view.c
+++ b/src/backend/commands/view.c
@@ -471,7 +471,7 @@ DefineView(ViewStmt *stmt, const char *queryString)
if (view_updatable_error)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("WITH CHECK OPTION is supported only on auto-updatable views"),
+ errmsg("WITH CHECK OPTION is supported only on automatically updatable views"),
errhint("%s", view_updatable_error)));
}
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index b428c18b5c..7c5bb77b0c 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
/* It's a skew tuple, so put it into that hash table */
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
bucketNumber);
+ hashtable->skewTuples += 1;
}
else
{
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
}
}
+ /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+ if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+ {
+ /* We never decrease the number of buckets. */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets %d => %d\n",
+ hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+ ExecHashIncreaseNumBuckets(hashtable);
+ }
+
+ /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+ hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
+ hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
+ hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
+ /* If we know we need to resize nbuckets, we can do it while rebatching. */
+ if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+ {
+ /* we never decrease the number of buckets */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+ hashtable->buckets = repalloc(hashtable->buckets,
+ sizeof(HashJoinTuple) * hashtable->nbuckets);
+ }
+
/*
* We will scan through the chunks directly, so that we can reset the
* buckets now and not have to keep track which tuples in the buckets have
@@ -704,6 +742,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
+ * ExecHashIncreaseNumBuckets
+ * increase the original number of buckets in order to reduce
+ * number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ HashMemoryChunk chunk;
+
+ /* do nothing if not an increase (it's called increase for a reason) */
+ if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+ return;
+
+ /*
+ * We already know the optimal number of buckets, so let's just
+ * compute the log2_nbuckets for it.
+ */
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets <= (INT_MAX / 2));
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+ /*
+ * Just reallocate the proper number of buckets - we don't need to
+ * walk through them - we can walk the dense-allocated chunks
+ * (just like in ExecHashIncreaseNumBatches, but without all the
+ * copying into new chunks)
+ */
+ hashtable->buckets =
+ (HashJoinTuple *) repalloc(hashtable->buckets,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+ /* scan through all tuples in all chunks to rebuild the hash table */
+ for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
+ {
+ /* process all tuples stored in this chunk */
+ size_t idx = 0;
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* add the tuple to the proper bucket */
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+
+ /* advance index past the tuple */
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+ }
+
+#ifdef HJDEBUG
+ printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ hashtable->nbuckets, (hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbuckets);
+#endif
+}
+
+
+/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
*/
HashJoinTuple hashTuple;
int hashTupleSize;
+ double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
/* Create the HashJoinTuple */
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashTuple->next = hashtable->buckets[bucketno];
hashtable->buckets[bucketno] = hashTuple;
+ /*
+ * Increase the (optimal) number of buckets if we just exceeded the
+ * NTUP_PER_BUCKET threshold, but only when there's still a single batch.
+ */
+ if ((hashtable->nbatch == 1) &&
+ (hashtable->nbuckets_optimal <= INT_MAX/2) && /* overflow protection */
+ (ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+ {
+ hashtable->nbuckets_optimal *= 2;
+ hashtable->log2_nbuckets_optimal += 1;
+ }
+
/* Account for space used, and back off if we've used too much */
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
- if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+ if (hashtable->spaceUsed +
+ hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
}
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
* functions are good about randomizing all their output bits, else we are
* likely to have very skewed bucket or batch occupancy.)
*
- * nbuckets doesn't change over the course of the join.
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth. Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
*
* nbatch is always a power of 2; we increase it only by doubling it. This
* effectively adds one more bit to the top of the batchno.
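The comment above describes ExecHashGetBucketAndBatch(), whose body is not included in this hunk. A minimal sketch of the scheme it implies (hypothetical helper, not patch code), assuming nbuckets and nbatch are both powers of two: the low log2_nbuckets bits of the hash value select the bucket, and the bits just above them select the batch, which is why nbuckets must stop changing once nbatch > 1.

	static inline void
	get_bucket_and_batch(uint32 hashvalue,
						 int nbuckets, int log2_nbuckets, int nbatch,
						 int *bucketno, int *batchno)
	{
		/* powers of two, so masking and shifting replace mod and div */
		*bucketno = hashvalue & (nbuckets - 1);
		if (nbatch > 1)
			*batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1);
		else
			*batchno = 0;		/* single batch: growing nbuckets is safe */
	}

While nbatch == 1, doubling nbuckets only redistributes tuples among buckets; no tuple can change batch, so the growth is purely an in-memory rearrangement.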
diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c
index 8d8f12952a..b05364ced0 100644
--- a/src/backend/libpq/be-secure-openssl.c
+++ b/src/backend/libpq/be-secure-openssl.c
@@ -614,7 +614,7 @@ be_tls_write(Port *port, void *ptr, size_t len)
if (retries >= 20)
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg("unable to complete SSL handshake")));
+ errmsg("could not complete SSL handshake on renegotiation, too many failures")));
}
}
}
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 7a38f2f150..092cf8fe43 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,8 +21,10 @@
*/
#include "postgres.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
+#include "utils/chash.h"
/* entry for buffer lookup hashtable */
@@ -32,8 +34,13 @@ typedef struct
int id; /* Associated buffer ID */
} BufferLookupEnt;
-static HTAB *SharedBufHash;
-
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/*
* Estimate space needed for mapping hashtable
@@ -42,7 +49,13 @@ static HTAB *SharedBufHash;
Size
BufTableShmemSize(int size)
{
- return hash_estimate_size(size, sizeof(BufferLookupEnt));
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
}
/*
@@ -52,59 +65,29 @@ BufTableShmemSize(int size)
void
InitBufTable(int size)
{
- HASHCTL info;
-
- /* assume no locking is needed yet */
-
- /* BufferTag maps to Buffer */
- info.keysize = sizeof(BufferTag);
- info.entrysize = sizeof(BufferLookupEnt);
- info.hash = tag_hash;
- info.num_partitions = NUM_BUFFER_PARTITIONS;
-
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
- size, size,
- &info,
- HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- * Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag. We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
- return get_hash_value(SharedBufHash, (void *) tagPtr);
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
}
/*
* BufTableLookup
* Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
*/
int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+BufTableLookup(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_FIND,
- NULL);
+ BufferLookupEnt ent;
- if (!result)
+ ent.key = *tagPtr;
+ if (!CHashSearch(SharedBufHash, &ent))
return -1;
- return result->id;
+ return ent.id;
}
/*
@@ -118,27 +101,20 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+BufTableInsert(BufferTag *tagPtr, int buf_id)
{
- BufferLookupEnt *result;
- bool found;
+ BufferLookupEnt ent;
+
+ ent.key = *tagPtr;
+ ent.id = buf_id;
Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_ENTER,
- &found);
-
- if (found) /* found something already in the table */
- return result->id;
-
- result->id = buf_id;
+ if (CHashInsert(SharedBufHash, &ent))
+ return -1;
- return -1;
+ return ent.id;
}
/*
@@ -148,17 +124,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
* Caller must hold exclusive lock on BufMappingLock for tag's partition
*/
void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+BufTableDelete(BufferTag *tagPtr)
{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_REMOVE,
- NULL);
-
- if (!result) /* shouldn't happen */
+ if (!CHashDelete(SharedBufHash, tagPtr))
elog(ERROR, "shared buffer hash table corrupted");
}
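To summarize the caller-side protocol change (hypothetical callers distilled from the bufmgr.c hunks below, not literal patch code): lookups no longer take a buffer mapping partition lock, and responsibility for consistency moves to rechecking the buffer tag after pinning.

	/* before: every lookup bracketed by a mapping partition lock */
	static int
	lookup_before(BufferTag tag)
	{
		uint32	hash = BufTableHashCode(&tag);
		LWLock *lock = BufMappingPartitionLock(hash);
		int		buf_id;

		LWLockAcquire(lock, LW_SHARED);
		buf_id = BufTableLookup(&tag, hash);
		LWLockRelease(lock);
		return buf_id;
	}

	/* after: CHash synchronizes internally, so a bare lookup suffices;
	 * a caller that then pins the buffer must recheck its tag and retry,
	 * as BufferAlloc() below does, since the mapping can change before
	 * the pin lands */
	static int
	lookup_after(BufferTag tag)
	{
		return BufTableLookup(&tag);
	}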
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 45d1d61d95..437deb905c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -429,22 +429,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
else
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
int buf_id;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
+ buf_id = BufTableLookup(&newTag);
/* If not in buffers, initiate prefetch */
if (buf_id < 0)
@@ -822,11 +814,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
bool *foundPtr)
{
BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
BufferTag oldTag; /* previous identity of selected buffer */
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
int buf_id;
volatile BufferDesc *buf;
@@ -835,29 +823,31 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
+start:
+ buf_id = BufTableLookup(&newTag);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
* Found it. Now, pin the buffer so no one can steal it from the
- * buffer pool, and check to see if the correct data has been loaded
- * into the buffer.
+ * buffer pool.
*/
- buf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[buf_id];
- valid = PinBuffer(buf, strategy);
+ valid = PinBuffer(foundbuf, strategy);
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
+ /* Check whether someone recycled the buffer before we pinned it. */
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto start;
+ }
*foundPtr = TRUE;
+ /* Check to see if the correct data has been loaded into the buffer. */
if (!valid)
{
/*
@@ -867,7 +857,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer must
@@ -877,15 +867,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. Remember to unlock the mapping lock while doing the work.
- */
- LWLockRelease(newPartitionLock);
-
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -986,42 +970,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
if (oldFlags & BM_TAG_VALID)
{
- /*
- * Need to compute the old tag's hashcode and partition lock ID.
- * XXX is it worth storing the hashcode in BufferDesc so we need
- * not recompute it here? Probably not.
- */
+ /* Save old tag. */
oldTag = buf->tag;
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
- /*
- * Must lock the lower-numbered partition first to avoid
- * deadlocks.
- */
- if (oldPartitionLock < newPartitionLock)
- {
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- else if (oldPartitionLock > newPartitionLock)
- {
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- }
- else
- {
- /* only one partition, only one lock */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- }
- else
- {
- /* if it wasn't valid, we need only the new partition */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- /* these just keep the compiler quiet about uninit variables */
- oldHash = 0;
- oldPartitionLock = 0;
}
/*
@@ -1031,32 +981,34 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
- buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+enter:
+ buf_id = BufTableInsert(&newTag, buf->buf_id);
if (buf_id >= 0)
{
+ BufferDesc *foundbuf;
+
/*
- * Got a collision. Someone has already done what we were about to
- * do. We'll just handle this as if it were found in the buffer
- * pool in the first place. First, give up the buffer we were
- * planning to use.
+ * We've got a collision, apparently: it looks like someone else
+ * did what we were about to do. We can handle this as if we had
+ * found the buffer in the pool in the first place, but we must
+ * recheck the buffer tag after pinning it, because it could still
+ * get renamed under us.
+ */
+ foundbuf = &BufferDescriptors[buf_id];
+ valid = PinBuffer(foundbuf, strategy);
+ if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto enter;
+ }
+
+ /*
+ * Collision confirmed. Give up the buffer we were planning to
+ * use.
*/
UnpinBuffer(buf, true);
- /* Can give up that buffer's mapping partition lock now */
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
-
- /* remaining code should match code at top of routine */
-
- buf = &BufferDescriptors[buf_id];
-
- valid = PinBuffer(buf, strategy);
-
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
-
*foundPtr = TRUE;
if (!valid)
@@ -1068,7 +1020,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer
@@ -1078,7 +1030,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
/*
@@ -1097,11 +1049,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag, newHash);
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&newTag);
UnpinBuffer(buf, true);
}
@@ -1124,13 +1072,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf);
if (oldFlags & BM_TAG_VALID)
- {
- BufTableDelete(&oldTag, oldHash);
- if (oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- }
-
- LWLockRelease(newPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1166,42 +1108,11 @@ static void
InvalidateBuffer(volatile BufferDesc *buf)
{
BufferTag oldTag;
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
- UnlockBufHdr(buf);
-
- /*
- * Need to compute the old tag's hashcode and partition lock ID. XXX is it
- * worth storing the hashcode in BufferDesc so we need not recompute it
- * here? Probably not.
- */
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
- /*
- * Acquire exclusive mapping lock in preparation for changing the buffer's
- * association.
- */
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
- /* Re-lock the buffer header */
- LockBufHdr(buf);
-
- /* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
- {
- UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
- return;
- }
-
/*
* We assume the only reason for it to be pinned is that someone else is
* flushing the page out. Wait for them to finish. (This could be an
@@ -1211,15 +1122,21 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ while (buf->refcount != 0)
{
UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (GetPrivateRefCount(buf->buf_id) > 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
- goto retry;
+ LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ {
+ UnlockBufHdr(buf);
+ return;
+ }
}
/*
@@ -1237,12 +1154,7 @@ retry:
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
-
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ BufTableDelete(&oldTag);
/*
* Insert the buffer at the head of the list of free buffers.
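
For illustration, a minimal standalone sketch (not part of the patch) of the pin-then-recheck pattern the new BufferAlloc code relies on: look the tag up without any mapping partition lock, pin the buffer, and only then revalidate the tag, retrying if the buffer was recycled in between. All names here are invented, and a sequentially consistent fetch-and-add stands in for PinBuffer's barrier semantics.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <string.h>

    typedef struct { unsigned rnode, forknum, blocknum; } Tag;

    typedef struct
    {
        Tag        tag;     /* identity of the cached block */
        atomic_int pins;    /* nonzero prevents recycling */
        bool       valid;
    } Desc;

    #define NDESC 16
    static Desc descriptors[NDESC];

    /* Stand-in for the lock-free mapping table: tag -> slot, or -1. */
    static int lookup(const Tag *tag)
    {
        for (int i = 0; i < NDESC; i++)
            if (descriptors[i].valid &&
                memcmp(&descriptors[i].tag, tag, sizeof(Tag)) == 0)
                return i;
        return -1;
    }

    /* Return a pinned descriptor for 'tag', or NULL if it is not cached. */
    static Desc *pin_buffer(const Tag *tag)
    {
        for (;;)                            /* plays the role of 'goto start' */
        {
            int slot = lookup(tag);

            if (slot < 0)
                return NULL;

            Desc *d = &descriptors[slot];

            atomic_fetch_add(&d->pins, 1);  /* pin first ... */
            if (memcmp(&d->tag, tag, sizeof(Tag)) == 0)
                return d;                   /* ... then revalidate the tag */

            /* Recycled between lookup and pin: unpin and retry. */
            atomic_fetch_sub(&d->pins, 1);
        }
    }
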
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index d96627a774..90df5930e1 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -139,7 +139,7 @@ struct shm_mq_handle
};
static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes,
- void *data, bool nowait, Size *bytes_written);
+ const void *data, bool nowait, Size *bytes_written);
static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed,
bool nowait, Size *nbytesp, void **datap);
static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
@@ -301,7 +301,33 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
}
/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
* Write a message into a shared message queue.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait)
+{
+ shm_mq_iovec iov;
+
+ iov.data = data;
+ iov.len = nbytes;
+
+ return shm_mq_sendv(mqh, &iov, 1, nowait);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
*
* When nowait = false, we'll wait on our process latch when the ring buffer
* fills up, and then continue writing once the receiver has drained some data.
@@ -315,14 +341,22 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
* the length or payload will corrupt the queue.)
*/
shm_mq_result
-shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
{
shm_mq_result res;
shm_mq *mq = mqh->mqh_queue;
+ Size nbytes = 0;
Size bytes_written;
+ int i;
+ int which_iov = 0;
+ Size offset;
Assert(mq->mq_sender == MyProc);
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
/* Try to write, or finish writing, the length word into the buffer. */
while (!mqh->mqh_length_word_complete)
{
@@ -348,18 +382,80 @@ shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait)
/* Write the actual data bytes into the buffer. */
Assert(mqh->mqh_partial_bytes <= nbytes);
- res = shm_mq_send_bytes(mqh, nbytes - mqh->mqh_partial_bytes,
- ((char *) data) + mqh->mqh_partial_bytes,
- nowait, &bytes_written);
- if (res == SHM_MQ_WOULD_BLOCK)
- mqh->mqh_partial_bytes += bytes_written;
- else
+ offset = mqh->mqh_partial_bytes;
+ do
{
- mqh->mqh_partial_bytes = 0;
- mqh->mqh_length_word_complete = false;
- }
- if (res != SHM_MQ_SUCCESS)
- return res;
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd,
+ * except the last. Thus, if a chunk other than the last one ends
+ * on a non-MAXALIGN'd boundary, we have to combine the tail end of
+ * its data with data from one or more following chunks until we
+ * either reach the last chunk or accumulate a number of bytes which
+ * is MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0;
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
/* Notify receiver of the newly-written data, and return. */
return shm_mq_notify_receiver(mq);
@@ -653,8 +749,8 @@ shm_mq_detach(shm_mq *mq)
* Write bytes into a shared message queue.
*/
static shm_mq_result
-shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait,
- Size *bytes_written)
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
{
shm_mq *mq = mqh->mqh_queue;
Size sent = 0;
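
As a hedged usage sketch of the gathered-write API added here (assuming an already-attached shm_mq_handle; error handling elided), a caller can send one logical message assembled from a header byte plus a separate payload buffer, letting shm_mq_sendv do the MAXALIGN'd chunk-combining described in the comments above:

    #include "postgres.h"
    #include "storage/shm_mq.h"

    static shm_mq_result
    send_tagged_message(shm_mq_handle *mqh, char msgtype,
                        const char *payload, Size payload_len)
    {
        shm_mq_iovec iov[2];

        iov[0].data = &msgtype;     /* one-byte header chunk */
        iov[0].len = 1;
        iov[1].data = payload;      /* body chunk, any length */
        iov[1].len = payload_len;

        /* The receiver sees a single message of 1 + payload_len bytes. */
        return shm_mq_sendv(mqh, iov, 2, false);
    }
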
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 2ea2216a65..38614a449d 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
return structPtr;
}
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+ ShmemIndexEnt *result;
+ void *ptr;
+ bool found;
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, &found);
+ if (!found || result == NULL)
+ elog(ERROR, "shared memory structure %s not found", name);
+ ptr = result->location;
+ Assert(ptr != NULL);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return ptr;
+}
/*
* Add two Size values, checking for overflow
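
A hedged sketch of the intended pairing, modeled on the CHashInitialize code later in this patch: the creating process sizes and initializes the structure with ShmemInitStruct, while a process that merely needs to reattach (the EXEC_BACKEND case) finds it by name with ShmemAttachStruct. The structure and its name are invented for illustration.

    #include "postgres.h"
    #include "miscadmin.h"
    #include "storage/shmem.h"

    typedef struct { int nslots; } MyModuleState;

    static MyModuleState *
    my_module_state(void)
    {
        if (!IsUnderPostmaster)
        {
            bool found;

            return (MyModuleState *) ShmemInitStruct("my module state",
                                                     sizeof(MyModuleState),
                                                     &found);
        }

        /* Under the postmaster: the structure must already exist. */
        return (MyModuleState *) ShmemAttachStruct("my module state");
    }
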
diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c
index 2d071b2523..d9aaac9ac2 100644
--- a/src/backend/utils/adt/jsonb_op.c
+++ b/src/backend/utils/adt/jsonb_op.c
@@ -57,7 +57,7 @@ jsonb_exists_any(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -90,7 +90,7 @@ jsonb_exists_all(PG_FUNCTION_ARGS)
for (i = 0; i < elem_count; i++)
{
- JsonbValue strVal;
+ JsonbValue strVal;
if (key_nulls[i])
continue;
@@ -117,8 +117,7 @@ jsonb_contains(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
@@ -137,8 +136,7 @@ jsonb_contained(PG_FUNCTION_ARGS)
JsonbIterator *it1,
*it2;
- if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) ||
- JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
+ if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl))
PG_RETURN_BOOL(false);
it1 = JsonbIteratorInit(&val->root);
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index f157df3532..2ff85396d0 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -957,13 +957,24 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
}
else if (rcont == WJB_BEGIN_OBJECT)
{
- JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
-
+ Assert(vval.type == jbvObject);
Assert(vcontained.type == jbvObject);
+ /*
+ * If the lhs has fewer pairs than the rhs, it can't possibly contain
+ * the rhs. (This conclusion is safe only because we de-duplicate
+ * keys in all Jsonb objects; thus there can be no corresponding
+ * optimization in the array case: for instance, the array ["x"]
+ * contains ["x", "x"], because duplicate elements on the rhs may all
+ * match a single lhs element.) The case probably won't arise often,
+ * but since it's such a cheap check we may as well make it.
+ */
+ if (vval.val.object.nPairs < vcontained.val.object.nPairs)
+ return false;
+
/* Work through rhs "is it contained within?" object */
for (;;)
{
+ JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */
+
rcont = JsonbIteratorNext(mContained, &vcontained, false);
/*
@@ -1047,6 +1058,7 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained)
JsonbValue *lhsConts = NULL;
uint32 nLhsElems = vval.val.array.nElems;
+ Assert(vval.type == jbvArray);
Assert(vcontained.type == jbvArray);
/*
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 4eeb6314fa..67539ecde9 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -35,6 +35,7 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 6e41cbd142..24ade6cc20 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -55,6 +55,7 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c856..5d5338266d 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000000..0d4dc78a4e
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ * concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic. An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As usual, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS). Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap. When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS. Once this
+ * step is completed, the node is deleted. As a cleanup operation, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node. Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node. Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed. This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists. There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa. Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't. For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system. For this code, that
+ * is true except when garbage collection occurs. In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time. The algorithm
+ * could be adapted so as to be completely lock-free, essentially by
+ * guaranteeing that even in the worst case no combination of processes can
+ * obstruct garbage collection to such a degree as to prevent an inserter
+ * from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity. However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr ((uint32) -2)
+#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x) ((x) & 1)
+#define CHashPtrGetOffset(x) ((x) >> 1)
+#define CHashPtrMark(x) ((x) | 1)
+#define CHashPtrUnmark(x) ((x) & ~1)
+#define MakeCHashPtr(x) ((x) << 1)
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
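+
+/*
+ * For illustration: arena offset 5 is stored as MakeCHashPtr(5) == 10;
+ * CHashPtrMark(10) == 11 sets the low-order mark bit, CHashPtrUnmark(11)
+ * recovers 10, and CHashPtrGetOffset() of either value yields 5 again.
+ * InvalidCHashPtr is ((uint32) -2) so that both it and its delete-marked
+ * form, ((uint32) -1), satisfy CHashPtrIsInvalid().
+ */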
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item. Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+ CHashPtr next; /* arena offset of next element */
+ union
+ {
+ uint32 hashcode; /* hash(key) */
+ CHashPtr gcnext; /* arena offset of next garbage item */
+ } un;
+} CHashNode;
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table. We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+ /*
+ * These fields do not change after initialization.
+ */
+ CHashDescriptor desc; /* descriptor for this hash table */
+ uint32 bucket_mask; /* # of buckets, minus one */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
+ char *arena; /* arena */
+
+ /*
+ * These fields will be different in each backend; the shared copy is
+ * irrelevant.
+ */
+ int gc_pid; /* PID that set gc_next */
+ uint32 gc_next; /* next garbage list to reclaim */
+ uint64 stats[CHS_NumberOfStatistics]; /* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+ (AssertMacro((offset) < (table)->arena_limit), \
+ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+ (AssertMacro(!CHashPtrIsInvalid(ptr)), \
+ CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+ ((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+ CHashPtr target;
+ CHashPtr next;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node;
+ bool found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+ "searches",
+ "searches failed",
+ "inserts",
+ "inserts failed",
+ "inserts retried",
+ "deletions",
+ "deletions failed",
+ "deletions retried",
+ "scan expunges",
+ "scan expunges failed",
+ "scans restarted",
+ "cleanup scans",
+ "allocations failed",
+ "garbage enqueues retried",
+ "garbage dequeues failed",
+ "garbage collections",
+ "garbage collection spins",
+ "garbage collection reclaims skipped",
+ "garbage collection fast reclaims",
+ "garbage collection reclaims retried",
+ "<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+ CHashTable table;
+ uint32 bucket_shift;
+
+ /* Sanity check. */
+ Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+ elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
+
+ /*
+ * The number of buckets must be a power of two. To avoid (as much as
+ * possible) having to traverse long bucket chains, we aim for a load
+ * factor <= 1.0, so this is a pretty simple calculation: we just find the
+ * smallest power of two greater than or equal to the target capacity.
+ */
+ bucket_shift = fls(desc->capacity - 1);
+ table->bucket_mask = (1 << bucket_shift) - 1;
+
+ /*
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
+ */
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->gc_next = 0;
+ table->gc_pid = 0;
+
+ /*
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ /* Zero out statistics. */
+ memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+ return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+ Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
+
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
+
+ size = MAXALIGN(sizeof(CHashTableData));
+ size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+ return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+ uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashPtr *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
+
+ /* Arena follows the various lists. */
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
+
+ /* Initialize all three sets of lists to empty. */
+ for (i = 0; i < nbuckets; ++i)
+ table->bucket[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
+
+ /* Put all arena elements on the free lists. */
+ for (i = 0; i < table->arena_limit; ++i)
+ {
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
+ CHashNode *n = CHashTableGetRaw(table, i);
+
+ n->un.gcnext = *f;
+ *f = MakeCHashPtr(i);
+ }
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
+
+/*
+ * Search a concurrent hash table. entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in. If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true. If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket and return data from any matching entry. */
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ CHashTableIncrementStatistic(table, CHS_Search);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Search_Failed);
+ return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table. entry should be the complete entry
+ * to be inserted. If no duplicate entry is found in the table, this function
+ * will insert the entry and return true. Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity. The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashPtr new;
+ CHashNode *nnew;
+ CHashScanResult scan;
+
+ /*
+ * Allocate and initialize a new entry, on the assumption that the insert
+ * will succeed. If it ends up failing, we must be sure to put this back
+ * on some free list, lest it be permanently leaked.
+ */
+ new = CHashAllocate(table);
+ nnew = CHashTableGetNode(table, new);
+ nnew->un.hashcode = hashcode;
+ memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+ /* Prevent garbage collection for this bucket. */
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /*
+ * Scan the bucket. If we don't find a match, use compare-and-swap to
+ * insert the new node at the insert position. If we do find a match,
+ * return the data to the caller.
+ */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+ else
+ {
+ /*
+ * We didn't find a matching element, so use compare-and-swap to
+ * attempt to insert the new element we've prepared. If this fails,
+ * someone concurrently inserted or deleted an element. The correct
+ * insertion point might have changed, or the key we're trying to
+ * insert might now be present when it wasn't before, so we'll have
+ * to search the bucket chain anew.
+ *
+ * There is a risk of starvation here, but we hope it will not happen
+ * in practice. Contention for inserting new elements should be
+ * spread out pretty much evenly across N+M possible insertion points,
+ * where N is the number of buckets and M is the number of elements
+ * in the table. Even for a quite modestly sized table this is likely
+ * to exceed the number of CPU cores.
+ */
+ Assert(!CHashPtrIsMarked(scan.target));
+ nnew->next = scan.target;
+ if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target, new))
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+ goto retry;
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /*
+ * If the insert failed, add the entry we allocated to the appropriate
+ * garbage list. We can't simply put it back on the freelist,
+ * because that leads to an ABA problem: some other backend could
+ * read the head of the freelist, go into the tank, and then use
+ * CAS to pop an element. If, at that point, it finds the same
+ * element at the head of the freelist but with a different successor,
+ * we're hosed. To prevent that, we ensure that elements are added
+ * to a given freelist only after verifying that any allocations in
+ * progress at the time we popped the freelist have completed. This
+ * guarantees that any allocation still in progress at the time this
+ * element makes it back to the freelist is trying to allocate some
+ * other node.
+ */
+ CHashTableIncrementStatistic(table, CHS_Insert);
+ if (scan.found)
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+ CHashAddToGarbage(table, bucket, new);
+ }
+
+ /* The insert succeeded if and only if no duplicate was found. */
+ return !scan.found;
+}
+
+/*
+ * Delete from a concurrent hash table. entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket. */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+
+ /* If we found it, try to delete it. */
+ if (scan.found)
+ {
+ Assert(!CHashPtrIsMarked(scan.next));
+
+ /* Attempt to apply delete-mark. */
+ if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+ scan.next,
+ CHashPtrMark(scan.next)))
+ {
+ CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+ goto retry;
+ }
+
+ /* Deletion is done; attempt to remove node from list. */
+ if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target,
+ scan.next))
+ CHashAddToGarbage(table, bucket, scan.target);
+ else
+ {
+ CHashScanResult cleanup_scan;
+
+ /*
+ * If we weren't able to remove the deleted item, rescan
+ * the bucket to make sure it's really gone. This is just
+ * like a regular bucket scan, except that we don't care
+ * about the results. We're just doing it to achieve the
+ * side-effect of removing delete-marked nodes from the
+ * bucket chain.
+ */
+ CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+ CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /* We're done. */
+ CHashTableIncrementStatistic(table, CHS_Delete);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+ return scan.found;
+}
+
+/*
+ * Provide backend-local statistics to caller.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+ memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise. res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory. If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res)
+{
+ CHashPtr target;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node = NULL;
+
+retry:
+ pointer_to_target = start;
+ target = *pointer_to_target;
+ for (;;)
+ {
+ CHashPtr next;
+ uint32 h;
+ int cmp;
+
+ /*
+ * If we've reached the end of the bucket chain, stop; otherwise,
+ * figure out the actual address of the next item.
+ */
+ if (CHashPtrIsInvalid(target))
+ {
+ res->found = false;
+ break;
+ }
+ target_node = CHashTableGetNode(table, target);
+
+ /*
+ * target may have been fetched from an arena entry that could be
+ * concurrently modified, so a dependency barrier is required before
+ * dereferencing the derived pointer.
+ */
+ pg_read_barrier_depends();
+ next = target_node->next;
+
+ /*
+ * For simplicity, any bucket scan, even if it's servicing a search,
+ * will attempt to remove delete-marked items from the bucket. This
+ * ensures that delete-marked elements are removed from bucket chains
+ * as quickly as possible and reduces code duplication. See
+ * CHashDelete for further comments about why delete-marking is
+ * necessary and how it allows safe deletion.
+ */
+ if (CHashPtrIsMarked(next))
+ {
+zap:
+ if (__sync_bool_compare_and_swap(pointer_to_target,
+ target,
+ CHashPtrUnmark(next)))
+ {
+ /*
+ * No one else can possibly have modified target_node->next,
+ * because such modifications are not allowed after a
+ * delete-mark has been applied. Thus, if we just keep
+ * following the next pointers, we're guaranteed to visit
+ * all non-deleted items (and possibly some deleted items)
+ * that were present at the time we began the scan.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+ CHashAddToGarbage(table, hashcode & table->bucket_mask,
+ target);
+ target = CHashPtrUnmark(next);
+ }
+ else
+ {
+ /*
+ * If the previous node has been delete-marked, we can't
+ * further update the next-pointer, so restart the scan.
+ * Otherwise, we know that some other backend removed one
+ * or more deleted nodes from the bucket chain just as we
+ * were trying to do, and we can simply continue the scan
+ * from wherever the previous node is pointing now. It's
+ * possible that some concurrent inserts have happened, too,
+ * but that's OK; we can view those as having happened "before"
+ * whatever operation this scan is supporting.
+ *
+ * Although starvation is a theoretical possibility here if
+ * we are forced to retry repeatedly, even a single retry is
+ * vanishingly unlikely in practice. It requires some other
+ * backend to delete both the node that we're looking at and
+ * the node which precedes it before we advance to the next
+ * node. That could certainly happen occasionally, but we'd
+ * have to be pretty unlucky to have it happen even twice in
+ * a row.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+ target = *pointer_to_target;
+ if (CHashPtrIsMarked(target))
+ {
+ CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+ goto retry;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * Bucket chains are kept in order, so that there is exactly one legal
+ * point at which any given key can be inserted. The ordering is by
+ * hashcode first, and then by memcmp ordering of the keys involved.
+ */
+ h = target_node->un.hashcode;
+ if (h == hashcode)
+ cmp = memcmp(CHashNodeGetItem(target_node), key,
+ table->desc.key_size);
+ else if (h > hashcode)
+ cmp = 1;
+ else
+ cmp = -1;
+
+ /*
+ * If cmp < 0, then we haven't yet reached the point at which we'd
+ * expect to find the key, so we must continue the scan. If cmp == 0,
+ * we've found the key and can stop. If cmp > 0, we've either passed
+ * the point where we expect to find the key OR someone delete-marked
+ * the item and overwrote the hashcode with a gcnext pointer. In the
+ * latter case we must take care not to be fooled into stopping the
+ * scan early.
+ */
+ if (cmp >= 0)
+ {
+ if (cmp == 0)
+ {
+ res->found = true;
+ res->next = next;
+ break;
+ }
+ else
+ {
+ /*
+ * pg_read_barrier() prevents the reread of the next pointer
+ * from being speculated ahead of the read of the hash value.
+ */
+ pg_read_barrier();
+ next = target_node->next;
+ if (CHashPtrIsMarked(next))
+ goto zap;
+ res->found = false;
+ break;
+ }
+ }
+
+ /* Continue scan from next node. */
+ pointer_to_target = &target_node->next;
+ target = next;
+ }
+
+ /* Send results back to caller. */
+ res->target = target;
+ res->pointer_to_target = pointer_to_target;
+ res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists. Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list. In this way, if there's only one backend
+ * active, it will typically find a free element in the first place it looks:
+ * its own free list. It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+ uint32 f_current;
+ CHashPtr new;
+
+ /* Pick a starting freelist based on our backend ID. */
+ f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+ /* If this process hasn't initialized gc_next yet, do that now. */
+ if (table->gc_pid != MyProcPid)
+ {
+ table->gc_pid = MyProcPid;
+ table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+ }
+
+ /* Loop until we allocate an element. */
+ for (;;)
+ {
+ CHashPtr *b;
+
+ /*
+ * Try to pop an element from a freelist using compare-and-swap.
+ *
+ * Note that this is only safe if it's impossible for the next pointer
+ * of the first element on the list to change between the time when
+ * we read it and the time we use CAS to pop it off the list. This
+ * means that we can't allow any element that is currently on this
+ * freelist to be allocated, marked as garbage, garbage collected,
+ * and returned back to this freelist before we finish the CAS.
+ *
+ * If we attempt to pop the free-list and fail, we retry immediately
+ * with the same free-list. This reduces the frequency with which
+ * we're obliged to update our hazard pointers, which is a material
+ * savings due to the associated memory barrier.
+ */
+ b = CHashTableGetFreeList(table, f_current);
+ MyProc->hazard[0] = b;
+ pg_memory_barrier();
+ new = *b;
+ while (!CHashPtrIsInvalid(new))
+ {
+ CHashNode *n = CHashTableGetNode(table, new);
+
+ /*
+ * n is computed from table->freelist[f_current], which could
+ * be modified by concurrent activity, so we need a dependency
+ * barrier here.
+ */
+ pg_read_barrier_depends();
+ if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+ return new;
+ CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+ new = *b;
+ }
+
+ /* Attempt garbage collection. */
+ new = CHashAllocateViaGC(table);
+ if (!CHashPtrIsInvalid(new))
+ return new;
+
+ /* Advance to next freelist. */
+ f_current = (f_current + 1) % CHashTableNFreeLists(table);
+ }
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+ uint32 f_home;
+ CHashPtr *b;
+ CHashPtr *fh;
+ CHashPtr fhead;
+ CHashPtr garbage;
+ CHashPtr new;
+ CHashNode *n;
+ uint32 i;
+
+ /* Pick a target freelist based on our backend ID. */
+ f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+ fh = CHashTableGetFreeList(table, f_home);
+
+ /* Select target garbage list. */
+ table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+ b = CHashTableGetGarbageList(table, table->gc_next);
+ garbage = *b;
+
+ /* If list is empty, fail. */
+ if (CHashPtrIsInvalid(garbage))
+ return InvalidCHashPtr;
+
+ /* If we're unable to empty the list via compare-and-swap, fail. */
+ if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+ {
+ CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+ return InvalidCHashPtr;
+ }
+
+ /*
+ * Before moving the elements removed from the garbage list to the
+ * freelist, we must wait until (1) all bucket scans that might
+ * still see those elements as part of the bucket chain
+ * have completed and (2) all allocations that might see an old
+ * version of the freelist containing one of the elements to be
+ * garbage collected have completed.
+ *
+ * Note: We can't begin this operation until the clearing of the
+ * garbage list has been committed to memory, but since that was
+ * done using an atomic operation no explicit barrier is needed
+ * here.
+ *
+ * Note: We could have a "soft" version of this that merely
+ * requeues the garbage if it's not immediately recyclable, but
+ * it's not clear that we need such a thing. On the flip side we
+ * might want to eventually enter a longer sleep here, or PANIC,
+ * but it's not clear exactly how to calibrate that.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC);
+ MyProc->hazard[0] = NULL;
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+ void *hazard;
+
+ hazard = proc->hazard[0];
+ if (hazard == b || hazard == fh)
+ {
+ CHashTableIncrementStatistic(table, CHS_GC_Spin);
+ do
+ {
+ hazard = proc->hazard[0];
+ } while (hazard == b || hazard == fh);
+ }
+ }
+
+ /* Remove one item from list to satisfy current allocation. */
+ new = garbage;
+ n = CHashTableGetNode(table, new);
+ pg_read_barrier_depends();
+ fhead = n->un.gcnext;
+
+ if (CHashPtrIsInvalid(fhead))
+ {
+ /*
+ * We have reclaimed exactly one node, so there's nothing to put
+ * back on the free list. In this case (only) we need a
+ * memory barrier, because the reads above must complete
+ * before we overwrite n->un.gcnext with a new hashcode.
+ * (This is only needed when we reclaim exactly one node,
+ * because in any other case we'll do a compare-and-swap
+ * before returning, which implies a full barrier.)
+ */
+ pg_memory_barrier();
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+ }
+ else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+ {
+ /*
+ * Our free list was empty, and we've successfully pushed the
+ * reclaimed nodes onto it. So we're done.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+ }
+ else
+ {
+ CHashPtr fcurrent;
+ CHashPtr fnext;
+ CHashPtr oldhead;
+
+ /* Walk list of reclaimed elements to end. */
+ fcurrent = fhead;
+ for (;;)
+ {
+ n = CHashTableGetNode(table, fcurrent);
+ fnext = n->un.gcnext;
+ if (CHashPtrIsInvalid(fnext))
+ break;
+ fcurrent = fnext;
+ }
+
+ /* Push reclaimed elements onto home free list. */
+ for (;;)
+ {
+ oldhead = *fh;
+ n->un.gcnext = oldhead;
+ if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+ break;
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+ }
+ }
+
+ /* Return the element we saved for ourselves. */
+ return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list. We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer. It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+ CHashPtr g;
+ CHashNode *n;
+ CHashPtr *garbage;
+
+ n = CHashTableGetNode(table, c);
+ garbage = CHashTableGetGarbageByBucket(table, bucket);
+
+ while (1)
+ {
+ g = *garbage;
+ n->un.gcnext = g;
+ if (__sync_bool_compare_and_swap(garbage, g, c))
+ break;
+ CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+ }
+}
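
For reference, a hedged end-to-end sketch of the API defined above. The descriptor fields (shmem_name, capacity, element_size, key_size) are the ones chash.c itself reads; since the field order in CHashDescriptor is not shown here, designated initializers are used. The entry type, names, and call sites are invented. Entries are laid out key-first, as CHashSearch and CHashInsert assume.

    #include "postgres.h"
    #include "utils/chash.h"

    typedef struct
    {
        uint32 key;     /* first key_size bytes form the lookup key */
        uint32 value;   /* non-key portion, copied in and out */
    } DemoEntry;

    static CHashTable demo_table;
    static CHashDescriptor demo_desc = {
        .shmem_name = "demo chash",
        .capacity = 1024,
        .element_size = sizeof(DemoEntry),
        .key_size = sizeof(uint32)
    };

    /* At shared-memory sizing time: */
    static Size
    demo_shmem_size(void)
    {
        demo_table = CHashBootstrap(&demo_desc);
        return CHashEstimateSize(demo_table);
    }

    /* At shared-memory creation (or EXEC_BACKEND reattach) time: */
    static void
    demo_shmem_init(void)
    {
        demo_table = CHashInitialize(demo_table, &demo_desc);
    }

    /* In any backend afterwards: */
    static void
    demo_ops(void)
    {
        DemoEntry e;

        e.key = 42;
        e.value = 7;
        if (!CHashInsert(demo_table, &e))
            elog(LOG, "key exists; current value is %u", e.value);

        e.key = 42;
        if (CHashSearch(demo_table, &e))
            elog(LOG, "found value %u", e.value);

        (void) CHashDelete(demo_table, &e);
    }
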
diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c
index e6f69e4edd..7374cc8eb4 100644
--- a/src/bin/pg_basebackup/pg_receivexlog.c
+++ b/src/bin/pg_basebackup/pg_receivexlog.c
@@ -66,9 +66,12 @@ usage(void)
printf(_(" %s [OPTION]...\n"), progname);
printf(_("\nOptions:\n"));
printf(_(" -D, --directory=DIR receive transaction log files into this directory\n"));
+ printf(_(" -F --fsync-interval=SECS\n"
+ " time between fsyncs to transaction log files (default: %d)\n"), (fsync_interval / 1000));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
- printf(_(" -F --fsync-interval=INTERVAL\n"
- " frequency of syncs to transaction log files (in seconds)\n"));
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -76,12 +79,9 @@ usage(void)
printf(_(" -d, --dbname=CONNSTR connection string\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
printf(_(" -p, --port=PORT database server port number\n"));
- printf(_(" -s, --status-interval=INTERVAL\n"
- " time between status packets sent to server (in seconds)\n"));
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_("\nOptional actions:\n"));
printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c
index 1a01167912..0d97638851 100644
--- a/src/bin/pg_basebackup/pg_recvlogical.c
+++ b/src/bin/pg_basebackup/pg_recvlogical.c
@@ -62,15 +62,27 @@ static void disconnect_and_exit(int code);
static void
usage(void)
{
- printf(_("%s receives PostgreSQL logical change stream.\n\n"),
+ printf(_("%s receives PostgreSQL logical change streams.\n\n"),
progname);
printf(_("Usage:\n"));
printf(_(" %s [OPTION]...\n"), progname);
+ printf(_("\nAction to be performed:\n"));
+ printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
+ printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nOptions:\n"));
- printf(_(" -f, --file=FILE receive log into this file. - for stdout\n"));
+ printf(_(" -f, --file=FILE receive log into this file, - for stdout\n"));
printf(_(" -F --fsync-interval=SECS\n"
- " frequency of syncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000));
+ printf(_(" -I, --startpos=LSN where in an existing slot should the streaming start\n"));
printf(_(" -n, --no-loop do not loop on connection lost\n"));
+ printf(_(" -o, --option=NAME[=VALUE]\n"
+ " pass option NAME with optional value VALUE to the\n"
+ " output plugin\n"));
+ printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
+ printf(_(" -s, --status-interval=SECS\n"
+ " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
+ printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -81,19 +93,6 @@ usage(void)
printf(_(" -U, --username=NAME connect as specified database user\n"));
printf(_(" -w, --no-password never prompt for password\n"));
printf(_(" -W, --password force password prompt (should happen automatically)\n"));
- printf(_("\nReplication options:\n"));
- printf(_(" -I, --startpos=PTR where in an existing slot should the streaming start\n"));
- printf(_(" -o, --option=NAME[=VALUE]\n"
- " specify option NAME with optional value VALUE, to be passed\n"
- " to the output plugin\n"));
- printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin);
- printf(_(" -s, --status-interval=SECS\n"
- " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000));
- printf(_(" -S, --slot=SLOT name of the logical replication slot\n"));
- printf(_("\nAction to be performed:\n"));
- printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n"));
- printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n"));
- printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n"));
printf(_("\nReport bugs to <pgsql-bugs@postgresql.org>.\n"));
}
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index a46ca53ba6..733f1cbc86 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1456,7 +1456,9 @@ pgwin32_doRegister(void)
NULL, NULL, "RPCSS\0", register_username, register_password)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not register service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1484,14 +1486,18 @@ pgwin32_doUnregister(void)
if ((hService = OpenService(hSCM, register_servicename, DELETE)) == NULL)
{
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not open service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not open service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
if (!DeleteService(hService))
{
CloseServiceHandle(hService);
CloseServiceHandle(hSCM);
- write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
CloseServiceHandle(hService);
@@ -1627,7 +1633,9 @@ pgwin32_doRunAsService(void)
if (StartServiceCtrlDispatcher(st) == 0)
{
- write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError());
+ write_stderr(_("%s: could not start service \"%s\": error code %lu\n"),
+ progname, register_servicename,
+ (unsigned long) GetLastError());
exit(1);
}
}
@@ -1708,7 +1716,14 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
/* Open the current token to use as a base for the restricted one */
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &origToken))
{
- write_stderr(_("%s: could not open process token: error code %lu\n"), progname, GetLastError());
+ /*
+ * Most Windows targets make DWORD a 32-bit unsigned long. Cygwin
+ * x86_64, an LP64 target, makes it a 32-bit unsigned int. In code
+ * built for Cygwin as well as for native Windows targets, cast DWORD
+ * before printing.
+ */
+ write_stderr(_("%s: could not open process token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1721,7 +1736,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_POWER_USERS, 0, 0, 0, 0, 0,
0, &dropSids[1].Sid))
{
- write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not allocate SIDs: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1740,7 +1756,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
if (!b)
{
- write_stderr(_("%s: could not create restricted token: error code %lu\n"), progname, GetLastError());
+ write_stderr(_("%s: could not create restricted token: error code %lu\n"),
+ progname, (unsigned long) GetLastError());
return 0;
}
@@ -1791,7 +1808,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
HANDLE job;
char jobname[128];
- sprintf(jobname, "PostgreSQL_%lu", processInfo->dwProcessId);
+ sprintf(jobname, "PostgreSQL_%lu",
+ (unsigned long) processInfo->dwProcessId);
job = _CreateJobObject(NULL, jobname);
if (job)
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index c9e61dfa39..0e1e0cd5f0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -127,6 +127,10 @@ typedef struct HashJoinTableData
int nbuckets; /* # buckets in the in-memory hash table */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
+ int nbuckets_original; /* # buckets when starting the first hash */
+ int nbuckets_optimal; /* optimal # buckets (per batch) */
+ int log2_nbuckets_optimal; /* log2(nbuckets_optimal) */
+
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
@@ -148,6 +152,7 @@ typedef struct HashJoinTableData
bool growEnabled; /* flag to shut off nbatch increases */
double totalTuples; /* # tuples obtained from inner plan */
+ double skewTuples; /* # tuples inserted into skew hashtable */
/*
* These arrays are allocated for the life of the hash join, but only if
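[Editor's note: the new fields above let a hash join grow its bucket array when the planner's row estimate proves too low. A minimal sketch of the bookkeeping they enable, assuming a target fill factor NTUP_PER_BUCKET and a caller-supplied table; the commit's real logic lives in nodeHash.c, not in this hypothetical helper.]

    #include "postgres.h"
    #include "executor/hashjoin.h"

    #define NTUP_PER_BUCKET 10      /* assumed target tuples per bucket */

    /*
     * Double the optimal bucket count whenever the average bucket would
     * otherwise exceed the target fill factor (illustrative only).
     */
    static void
    update_optimal_buckets(HashJoinTable hashtable)
    {
        while (hashtable->totalTuples / hashtable->nbuckets_optimal >
               NTUP_PER_BUCKET)
        {
            hashtable->nbuckets_optimal *= 2;
            hashtable->log2_nbuckets_optimal += 1;
        }
    }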
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index b36705b862..6ef779bf95 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
*/
#include "port/atomics.h"
+/*
+ * If dependency barriers are undefined, we define them as no-ops. The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
#endif /* BARRIER_H */
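[Editor's note: a dependency barrier orders a pointer load against later loads through that pointer, which every mainstream CPU except DEC Alpha guarantees for free. A hedged consumer-side sketch; the Node type and shared_ptr variable are illustrative, not part of the patch.]

    #include "postgres.h"
    #include "storage/barrier.h"

    typedef struct Node { int payload; } Node;

    static Node *volatile shared_ptr;   /* assumed published by another backend */

    static int
    read_payload(void)
    {
        Node   *n = shared_ptr;         /* load the pointer */

        if (n == NULL)
            return -1;
        pg_read_barrier_depends();      /* order loads of *n after the load of n */
        return n->payload;              /* safe even on DEC Alpha */
    }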
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0e69b633c3..4c6fac8052 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
)
/*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
- ((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -200,9 +186,9 @@ extern void StrategyInitialize(bool init);
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern int BufTableLookup(BufferTag *tagPtr);
+extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr);
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
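[Editor's note: the signature change reflects the switch from a partitioned dynahash to the lock-free table; callers no longer compute a hash code or hold a buffer-mapping partition lock around the call. A hedged sketch of the caller-side difference, assuming the -1-on-miss return contract is unchanged; the real updates are in bufmgr.c and buf_table.c.]

    #include "postgres.h"
    #include "storage/buf_internals.h"

    /*
     * Old pattern (macros removed by this patch):
     *     hashcode = BufTableHashCode(&tag);
     *     partitionLock = BufMappingPartitionLock(hashcode);
     *     LWLockAcquire(partitionLock, LW_SHARED);
     *     buf_id = BufTableLookup(&tag, hashcode);
     *     LWLockRelease(partitionLock);
     */
    static int
    lookup_buffer(BufferTag *tag)
    {
        /* New pattern: the concurrent hash table synchronizes internally. */
        return BufTableLookup(tag);     /* buffer id, or -1 if not found */
    }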
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 595e69da48..8e98425ca4 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -140,7 +140,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
*/
/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
+#define NUM_BUFFER_PARTITIONS 0
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 38758d3ea5..cdf2f268fd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,14 @@ struct XidCache
#define FP_LOCK_SLOTS_PER_BACKEND 16
/*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define NUM_HAZARD_POINTERS 1
+
+/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
@@ -143,6 +151,12 @@ struct PGPROC
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
+
+ /*
+ * Hazard pointers. Currently one is enough, though some algorithms
+ * require a few more.
+ */
+ void *hazard[NUM_HAZARD_POINTERS];
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h
index 5bae3807af..063400ae28 100644
--- a/src/include/storage/shm_mq.h
+++ b/src/include/storage/shm_mq.h
@@ -25,6 +25,13 @@ typedef struct shm_mq shm_mq;
struct shm_mq_handle;
typedef struct shm_mq_handle shm_mq_handle;
+/* Descriptors for a single write spanning multiple locations. */
+typedef struct
+{
+ const char *data;
+ Size len;
+} shm_mq_iovec;
+
/* Possible results of a send or receive operation. */
typedef enum
{
@@ -52,12 +59,17 @@ extern PGPROC *shm_mq_get_sender(shm_mq *);
extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg,
BackgroundWorkerHandle *handle);
+/* Associate worker handle with shm_mq. */
+extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *);
+
/* Break connection. */
extern void shm_mq_detach(shm_mq *);
/* Send or receive messages. */
extern shm_mq_result shm_mq_send(shm_mq_handle *mqh,
- Size nbytes, void *data, bool nowait);
+ Size nbytes, const void *data, bool nowait);
+extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh,
+ shm_mq_iovec *iov, int iovcnt, bool nowait);
extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh,
Size *nbytesp, void **datap, bool nowait);
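[Editor's note: shm_mq_sendv() is the gathered-write counterpart of shm_mq_send(); the iovec lets a caller send pieces that live in different buffers as a single message, without an intermediate copy. A hedged usage sketch; MyHeader and the helper are hypothetical.]

    #include "postgres.h"
    #include "storage/shm_mq.h"

    typedef struct { int type; int flags; } MyHeader;   /* hypothetical */

    /* Send a fixed header plus a variable payload as one message. */
    static void
    send_two_part(shm_mq_handle *mqh, MyHeader hdr,
                  const char *payload, Size payload_len)
    {
        shm_mq_iovec    iov[2];
        shm_mq_result   res;

        iov[0].data = (const char *) &hdr;
        iov[0].len = sizeof(hdr);
        iov[1].data = payload;
        iov[1].len = payload_len;

        res = shm_mq_sendv(mqh, iov, 2, false);     /* block until sent */
        if (res == SHM_MQ_DETACHED)
            elog(ERROR, "receiver has detached");
    }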
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 745eb7e576..4ff8415fac 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index d88e7a3b26..fb1b4a42dd 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -676,13 +676,10 @@ extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_indexdef_string(Oid indexrelid);
-extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
-extern char *pg_get_constraintdef_string(Oid constraintId);
extern Datum pg_get_expr(PG_FUNCTION_ARGS);
extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
@@ -692,17 +689,9 @@ extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
-extern char *deparse_expression(Node *expr, List *dpcontext,
- bool forceprefix, bool showimplicit);
-extern List *deparse_context_for(const char *aliasname, Oid relid);
-extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
- List *rtable, List *rtable_names);
-extern List *select_rtable_names_for_explain(List *rtable,
- Bitmapset *rels_used);
extern const char *quote_identifier(const char *ident);
extern char *quote_qualified_identifier(const char *qualifier,
const char *ident);
-extern char *generate_collation_name(Oid collid);
/* tid.c */
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000000..ee0573c9c7
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ * Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H
+#define CHASH_H
+
+/* Everything the caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+ const char *shmem_name; /* shared memory name for this hash table */
+ uint32 capacity; /* maximum size of hash table */
+ uint16 element_size; /* size of each element */
+ uint16 key_size; /* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+ CHS_Search, /* search */
+ CHS_Search_Failed, /* search failed (no such key) */
+ CHS_Insert, /* insert */
+ CHS_Insert_Failed, /* insert failed (duplicate key) */
+ CHS_Insert_Retry, /* insert retried (concurrent update) */
+ CHS_Delete, /* delete */
+ CHS_Delete_Failed, /* delete failed (no such key) */
+ CHS_Delete_Retry, /* delete retried (concurrent update) */
+ CHS_Scan_Expunge, /* scan expunged deleted item */
+ CHS_Scan_Expunge_Fail, /* scan failed to expunge */
+ CHS_Scan_Restart, /* concurrent deletes forced a scan restart */
+ CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */
+ CHS_Allocate_Fail, /* allocation failed to pop freelist */
+ CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */
+ CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */
+ CHS_GC, /* garbage collection cycle */
+ CHS_GC_Spin, /* GC spun waiting for concurrent process */
+ CHS_GC_Reclaim_Skipped, /* GC recovered only one item */
+ CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */
+ CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */
+ CHS_NumberOfStatistics /* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif /* CHASH_H */
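[Editor's note: to make the API above concrete, a hedged sketch of bootstrapping and probing a table, loosely modeled on how this patch's buf_table.c uses it. The key-first entry layout and the search-fills-in-the-entry behavior are assumptions read off the declarations, not documented guarantees.]

    #include "postgres.h"
    #include "utils/chash.h"

    typedef struct
    {
        uint32      key;        /* key must be the leading key_size bytes */
        int         value;      /* payload */
    } MyEntry;

    static CHashTable MyTable;

    static void
    my_table_create(void)
    {
        CHashDescriptor desc;

        desc.shmem_name = "my chash table";
        desc.capacity = 1024;
        desc.element_size = sizeof(MyEntry);
        desc.key_size = sizeof(uint32);

        /* First backend bootstraps; others would attach via CHashInitialize. */
        MyTable = CHashBootstrap(&desc);
    }

    static bool
    my_table_lookup(uint32 key, int *value)
    {
        MyEntry     ent;

        ent.key = key;
        if (!CHashSearch(MyTable, &ent))    /* assumed to copy the hit back */
            return false;
        *value = ent.value;
        return true;
    }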
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
new file mode 100644
index 0000000000..520b06653c
--- /dev/null
+++ b/src/include/utils/ruleutils.h
@@ -0,0 +1,34 @@
+/*-------------------------------------------------------------------------
+ *
+ * ruleutils.h
+ * Declarations for ruleutils.c
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/ruleutils.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RULEUTILS_H
+#define RULEUTILS_H
+
+#include "nodes/nodes.h"
+#include "nodes/parsenodes.h"
+#include "nodes/pg_list.h"
+
+
+extern char *pg_get_indexdef_string(Oid indexrelid);
+extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty);
+
+extern char *pg_get_constraintdef_string(Oid constraintId);
+extern char *deparse_expression(Node *expr, List *dpcontext,
+ bool forceprefix, bool showimplicit);
+extern List *deparse_context_for(const char *aliasname, Oid relid);
+extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
+ List *rtable, List *rtable_names);
+extern List *select_rtable_names_for_explain(List *rtable,
+ Bitmapset *rels_used);
+extern char *generate_collation_name(Oid collid);
+
+#endif /* RULEUTILS_H */
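[Editor's note: these relocated declarations are the server-internal deparsing entry points; callers now include this header instead of builtins.h. A hedged sketch of typical use, deparsing an expression over a single relation; the helper name is illustrative.]

    #include "postgres.h"
    #include "utils/ruleutils.h"

    /* Hypothetical helper: render an expression tree over relation "relid". */
    static char *
    deparse_for_relation(Node *expr, const char *aliasname, Oid relid)
    {
        List   *context = deparse_context_for(aliasname, relid);

        return deparse_expression(expr, context, false, false);
    }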
diff --git a/src/port/crypt.c b/src/port/crypt.c
index ef8bf46338..6a902ef0fc 100644
--- a/src/port/crypt.c
+++ b/src/port/crypt.c
@@ -87,7 +87,7 @@ static int des_cipher(const char *in, char *out, long salt, int num_iter);
* define "B64" to be the declaration for a 64 bit integer.
* XXX this feature is currently unused, see "endian" comment below.
*/
-#define B64 __int64
+#define B64 int64
/*
* define "LARGEDATA" to get faster permutations, by using about 72 kilobytes
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index eb37da7168..9146f59435 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/jsonb_1.out b/src/test/regress/expected/jsonb_1.out
index f3bfc7bcf5..83d61f8c7e 100644
--- a/src/test/regress/expected/jsonb_1.out
+++ b/src/test/regress/expected/jsonb_1.out
@@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
f
(1 row)
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
jsonb_contained
-----------------
diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
index 1076b76210..b04510c599 100644
--- a/src/test/regress/expected/matview.out
+++ b/src/test/regress/expected/matview.out
@@ -411,7 +411,7 @@ REFRESH MATERIALIZED VIEW mv;
ERROR: could not create unique index "mv_a_idx"
DETAIL: Key (a)=(1) is duplicated.
REFRESH MATERIALIZED VIEW CONCURRENTLY mv;
-ERROR: new data for "mv" contains duplicate rows without any NULL columns
+ERROR: new data for "mv" contains duplicate rows without any null columns
DETAIL: Row: (1,10)
DROP TABLE foo CASCADE;
NOTICE: drop cascades to materialized view mv
diff --git a/src/test/regress/expected/polygon.out b/src/test/regress/expected/polygon.out
index b252902720..33388eb909 100644
--- a/src/test/regress/expected/polygon.out
+++ b/src/test/regress/expected/polygon.out
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
INSERT INTO POLYGON_TBL(f1) VALUES ('(2.0,0.0),(2.0,4.0),(0.0,0.0)');
@@ -128,15 +128,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -248,11 +249,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(
(1 row)
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";
true
diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
index ed266d5c88..f1ed021be2 100644
--- a/src/test/regress/sql/jsonb.sql
+++ b/src/test/regress/sql/jsonb.sql
@@ -156,6 +156,13 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"c"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b"}';
SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}';
+SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb;
+SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb;
+SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb;
+SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb;
+
SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "c":null}', '{"a":"b", "b":1, "c":null}');
SELECT jsonb_contained('{"a":"b", "g":null}', '{"a":"b", "b":1, "c":null}');
diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql
index 2dad566f37..d95fa96447 100644
--- a/src/test/regress/sql/polygon.sql
+++ b/src/test/regress/sql/polygon.sql
@@ -3,15 +3,15 @@
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 # o +
--- / |
--- 0 #-----o-+
+-- 3 o
+-- |
+-- 2 + |
+-- / |
+-- 1 # +
+-- / o |
+-- 0 #-----o-+
--
--- 0 1 2 3 4
+-- 0 1 2 3 4
--
CREATE TABLE POLYGON_TBL(f1 polygon);
@@ -83,15 +83,16 @@ SELECT '' AS one, p.*
--
-- polygon logic
--
--- 3 o
--- |
--- 2 + |
--- / |
--- 1 / o +
+-- 3 o
+-- /|
+-- 2 + |
+-- / |
+-- 1 / o +
-- / |
--- 0 +-----o-+
+-- 0 +-----o-+
+--
+-- 0 1 2 3 4
--
--- 0 1 2 3 4
--
-- left of
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false;
@@ -155,11 +156,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' ~= polygon '(3.0,1.0),(3.0,3.0),(
SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS true;
-- +--------------------+
--- | *---* 1
+-- | *---* 1
-- | + | |
-- | 2 *---*
-- +--------------------+
--- 3
+-- 3
-- Edges 1-2, 2-3 are not shown on picture
SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true";