diff options
55 files changed, 2493 insertions, 530 deletions
diff --git a/contrib/Makefile b/contrib/Makefile index b37d0dd2c3..0b91ac10ee 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -20,6 +20,7 @@ SUBDIRS = \ earthdistance \ file_fdw \ fuzzystrmatch \ + hashtest \ hstore \ intagg \ intarray \ diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile new file mode 100644 index 0000000000..3ee42f87d8 --- /dev/null +++ b/contrib/hashtest/Makefile @@ -0,0 +1,18 @@ +# contrib/hashtest/Makefile + +MODULE_big = hashtest +OBJS = hashtest.o + +EXTENSION = hashtest +DATA = hashtest--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/hashtest +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql new file mode 100644 index 0000000000..e271baff0f --- /dev/null +++ b/contrib/hashtest/hashtest--1.0.sql @@ -0,0 +1,52 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION hashtest" to load this file. \quit + +CREATE FUNCTION chash_insert_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_insert_test' +LANGUAGE C; + +CREATE FUNCTION chash_search_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_search_test' +LANGUAGE C; + +CREATE FUNCTION chash_delete_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_delete_test' +LANGUAGE C; + +CREATE FUNCTION chash_concurrent_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_concurrent_test' +LANGUAGE C; + +CREATE FUNCTION chash_collision_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_collision_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_insert_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_insert_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_search_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_search_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_delete_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_delete_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_concurrent_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_concurrent_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_collision_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_collision_test' +LANGUAGE C; diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c new file mode 100644 index 0000000000..172a5bb156 --- /dev/null +++ b/contrib/hashtest/hashtest.c @@ -0,0 +1,527 @@ +/*------------------------------------------------------------------------- + * hashtest.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "funcapi.h" +#include "libpq/auth.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "portability/instr_time.h" +#include "storage/ipc.h" +#include "utils/chash.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); +Datum chash_insert_test(PG_FUNCTION_ARGS); +Datum chash_search_test(PG_FUNCTION_ARGS); +Datum chash_delete_test(PG_FUNCTION_ARGS); +Datum chash_concurrent_test(PG_FUNCTION_ARGS); +Datum chash_collision_test(PG_FUNCTION_ARGS); +Datum dynahash_insert_test(PG_FUNCTION_ARGS); +Datum dynahash_search_test(PG_FUNCTION_ARGS); +Datum dynahash_delete_test(PG_FUNCTION_ARGS); +Datum dynahash_concurrent_test(PG_FUNCTION_ARGS); +Datum dynahash_collision_test(PG_FUNCTION_ARGS); +static void hashtest_shmem_startup(void); + +PG_FUNCTION_INFO_V1(chash_insert_test); +PG_FUNCTION_INFO_V1(chash_search_test); +PG_FUNCTION_INFO_V1(chash_delete_test); +PG_FUNCTION_INFO_V1(chash_concurrent_test); +PG_FUNCTION_INFO_V1(chash_collision_test); +PG_FUNCTION_INFO_V1(dynahash_insert_test); +PG_FUNCTION_INFO_V1(dynahash_search_test); +PG_FUNCTION_INFO_V1(dynahash_delete_test); +PG_FUNCTION_INFO_V1(dynahash_concurrent_test); +PG_FUNCTION_INFO_V1(dynahash_collision_test); + +typedef struct +{ + uint32 key; + uint32 val; +} hentry; + +static CHashDescriptor cdesc = { + "hashtest-chash", /* name */ + 1048576, /* capacity */ + sizeof(hentry), /* element size */ + sizeof(uint32) /* key size */ +}; + +#define DYNAHASH_PARTITIONS 16 + +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static CHashTable chash; +static HTAB *dynahash; +static LWLockId dynahash_lock[DYNAHASH_PARTITIONS]; +static ClientAuthentication_hook_type original_client_auth_hook = NULL; + +static void hashtest_client_auth_hook(Port *port, int status); +static void chash_write_stats_to_log(int code, Datum dummy); + +#define dynahash_get_lock(hashcode) \ + (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS]) + +void +_PG_init(void) +{ + Size cs; + Size ds; + + if (!process_shared_preload_libraries_in_progress) + return; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = hashtest_shmem_startup; + chash = CHashBootstrap(&cdesc); + cs = CHashEstimateSize(chash); + RequestAddinShmemSpace(cs); + ds = hash_estimate_size(cdesc.capacity, cdesc.element_size); + RequestAddinShmemSpace(ds); + elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs, + (unsigned) ds); + RequestAddinLWLocks(DYNAHASH_PARTITIONS); + original_client_auth_hook = ClientAuthentication_hook; + ClientAuthentication_hook = hashtest_client_auth_hook; + +} + +static void +hashtest_client_auth_hook(Port *port, int status) +{ + if (original_client_auth_hook) + original_client_auth_hook(port, status); + on_proc_exit(chash_write_stats_to_log, (Datum) 0); +} + +static void +chash_write_stats_to_log(int code, Datum dummy) +{ + uint64 stats[CHS_NumberOfStatistics]; + CHashStatisticsType i; + StringInfoData buf; + + CHashStatistics(chash, stats); + initStringInfo(&buf); + + for (i = 0; i < CHS_NumberOfStatistics; ++i) + { + if (stats[i] == 0) + continue; + appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i], + CHashStatisticsNames[i]); + } + + if (buf.len > 1) + { + buf.data[buf.len-2] = '\0'; + elog(LOG, "chash statistics: %s", buf.data); + } +} + +static void +hashtest_shmem_startup(void) +{ + HASHCTL info; + uint32 i; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Initialize concurrent hash table. */ + chash = CHashInitialize(chash, &cdesc); + + /* Initialize shared dynahash table. */ + info.keysize = cdesc.key_size; + info.entrysize = cdesc.element_size; + info.hash = tag_hash; + info.num_partitions = DYNAHASH_PARTITIONS; + + dynahash = ShmemInitHash("hashtest-dynahash", + cdesc.capacity, cdesc.capacity, + &info, + HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + for (i = 0; i < DYNAHASH_PARTITIONS; ++i) + dynahash_lock[i] = LWLockAssign(); +} + +Datum +chash_insert_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + e.val = i * 31; + ok = CHashInsert(chash, &e); + if (!ok) + elog(LOG, "insert %u: failed", i); + ok = CHashInsert(chash, &e); + if (ok) + elog(LOG, "insert %u: worked twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +chash_search_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + ok = CHashSearch(chash, &e); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (e.val != e.key * 31) + elog(LOG, "search %u: found %u", i, e.val); + } + + PG_RETURN_VOID(); +} + +Datum +chash_delete_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + ok = CHashDelete(chash, &e); + if (!ok) + elog(LOG, "delete %u: not found", i); + ok = CHashDelete(chash, &e); + if (ok) + elog(LOG, "delete %u: found twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +chash_concurrent_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + uint32 seed = MyProcPid << 16; + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + e.val = MyProcPid; + ok = CHashInsert(chash, &e); + if (!ok) + elog(LOG, "insert %u: found", i); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + e.val = 0; + ok = CHashSearch(chash, &e); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "search %u: not found", i); + while (!CHashSearch(chash, &e)) + ++retry; + elog(LOG, "search %u: eventually found it after " + UINT64_FORMAT " retries", i, retry); + } + if (e.val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + ok = CHashDelete(chash, &e); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "delete %u: not found", i); + while (!CHashDelete(chash, &e)) + ++retry; + elog(LOG, "delete %u: eventually deleted it after " + UINT64_FORMAT " retries", i, retry); + } + } + + PG_RETURN_VOID(); +} + +Datum +chash_collision_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + /* Don't stack-allocate this. */ + static bool mine[10000]; + + memset(mine, 0, 10000 * sizeof(bool)); + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = i; + e.val = MyProcPid; + ok = CHashInsert(chash, &e); + if (ok) + mine[i] = true; + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + if (!mine[i]) + continue; + e.key = i; + ok = CHashSearch(chash, &e); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (e.val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, e.val); + ok = CHashDelete(chash, &e); + if (!ok) + elog(LOG, "delete %u: not found", i); + } + + PG_RETURN_VOID(); +} + +static bool +dynahash_insert(uint32 key, uint32 val) +{ + bool found; + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_EXCLUSIVE); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_ENTER, &found); + if (!found) + e->val = val; + LWLockRelease(lockid); + + return !found; +} + +static bool +dynahash_search(uint32 key, uint32 *val) +{ + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_SHARED); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_FIND, NULL); + if (e) + *val = e->val; + LWLockRelease(lockid); + + return e != NULL; +} + +static bool +dynahash_delete(uint32 key) +{ + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_EXCLUSIVE); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_REMOVE, NULL); + LWLockRelease(lockid); + + return e != NULL; +} + +Datum +dynahash_insert_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + ok = dynahash_insert(i, i * 31); + if (!ok) + elog(LOG, "insert %u: failed", i); + ok = dynahash_insert(i, i * 31); + if (ok) + elog(LOG, "insert %u: worked twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_search_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + uint32 val; + + ok = dynahash_search(i, &val); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (val != i* 31) + elog(LOG, "search %u: found %u", i, val); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_delete_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + ok = dynahash_delete(i); + if (!ok) + elog(LOG, "delete %u: not found", i); + ok = dynahash_delete(i); + if (ok) + elog(LOG, "delete %u: found twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_concurrent_test(PG_FUNCTION_ARGS) +{ + uint32 i; + uint32 val; + uint32 seed = MyProcPid << 16; + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_insert(seed | i, MyProcPid); + if (!ok) + elog(LOG, "insert %u: found", i); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_search(seed | i, &val); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "search %u: not found", i); + while (!dynahash_search(seed | i, &val)) + ++retry; + elog(LOG, "search %u: eventually found it after " + UINT64_FORMAT " retries", i, retry); + } + if (val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, val); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_delete(seed | i); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "delete %u: not found", i); + while (!dynahash_delete(seed | i)) + ++retry; + elog(LOG, "delete %u: eventually deleted it after " + UINT64_FORMAT " retries", i, retry); + } + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_collision_test(PG_FUNCTION_ARGS) +{ + uint32 i; + uint32 val; + + /* Don't stack-allocate this. */ + static bool mine[10000]; + + memset(mine, 0, 10000 * sizeof(bool)); + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_insert(i, MyProcPid); + if (ok) + mine[i] = true; + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + if (!mine[i]) + continue; + ok = dynahash_search(i, &val); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, val); + ok = dynahash_delete(i); + if (!ok) + elog(LOG, "delete %u: not found", i); + } + + PG_RETURN_VOID(); +} diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control new file mode 100644 index 0000000000..b8e0f01346 --- /dev/null +++ b/contrib/hashtest/hashtest.control @@ -0,0 +1,4 @@ +comment = 'hash testing code' +default_version = '1.0' +module_pathname = '$libdir/hashtest' +relocatable = true diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c index bbfcab71ce..56db0dd654 100644 --- a/contrib/pg_upgrade/check.c +++ b/contrib/pg_upgrade/check.c @@ -14,12 +14,10 @@ #include "pg_upgrade.h" -static void set_locale_and_encoding(ClusterInfo *cluster); static void check_new_cluster_is_empty(void); -static void check_locale_and_encoding(ControlData *oldctrl, - ControlData *newctrl); -static bool equivalent_locale(const char *loca, const char *locb); -static bool equivalent_encoding(const char *chara, const char *charb); +static void check_databases_are_compatible(void); +static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb); +static bool equivalent_locale(int category, const char *loca, const char *locb); static void check_is_install_user(ClusterInfo *cluster); static void check_for_prepared_transactions(ClusterInfo *cluster); static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); @@ -81,8 +79,6 @@ check_and_dump_old_cluster(bool live_check) if (!live_check) start_postmaster(&old_cluster, true); - set_locale_and_encoding(&old_cluster); - get_pg_database_relfilenode(&old_cluster); /* Extract a list of databases and tables from the old cluster */ @@ -127,13 +123,10 @@ check_and_dump_old_cluster(bool live_check) void check_new_cluster(void) { - set_locale_and_encoding(&new_cluster); - - check_locale_and_encoding(&old_cluster.controldata, &new_cluster.controldata); - get_db_and_rel_infos(&new_cluster); check_new_cluster_is_empty(); + check_databases_are_compatible(); check_loadable_libraries(); @@ -279,93 +272,25 @@ check_cluster_compatibility(bool live_check) /* - * set_locale_and_encoding() - * - * query the database to get the template0 locale - */ -static void -set_locale_and_encoding(ClusterInfo *cluster) -{ - ControlData *ctrl = &cluster->controldata; - PGconn *conn; - PGresult *res; - int i_encoding; - int cluster_version = cluster->major_version; - - conn = connectToServer(cluster, "template1"); - - /* for pg < 80400, we got the values from pg_controldata */ - if (cluster_version >= 80400) - { - int i_datcollate; - int i_datctype; - - res = executeQueryOrDie(conn, - "SELECT datcollate, datctype " - "FROM pg_catalog.pg_database " - "WHERE datname = 'template0' "); - assert(PQntuples(res) == 1); - - i_datcollate = PQfnumber(res, "datcollate"); - i_datctype = PQfnumber(res, "datctype"); - - if (GET_MAJOR_VERSION(cluster->major_version) < 902) - { - /* - * Pre-9.2 did not canonicalize the supplied locale names to match - * what the system returns, while 9.2+ does, so convert pre-9.2 to - * match. - */ - ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE, - pg_strdup(PQgetvalue(res, 0, i_datcollate))); - ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE, - pg_strdup(PQgetvalue(res, 0, i_datctype))); - } - else - { - ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate)); - ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype)); - } - - PQclear(res); - } - - res = executeQueryOrDie(conn, - "SELECT pg_catalog.pg_encoding_to_char(encoding) " - "FROM pg_catalog.pg_database " - "WHERE datname = 'template0' "); - assert(PQntuples(res) == 1); - - i_encoding = PQfnumber(res, "pg_encoding_to_char"); - ctrl->encoding = pg_strdup(PQgetvalue(res, 0, i_encoding)); - - PQclear(res); - - PQfinish(conn); -} - - -/* * check_locale_and_encoding() * - * Check that old and new locale and encoding match. Even though the backend - * tries to canonicalize stored locale names, the platform often doesn't - * cooperate, so it's entirely possible that one DB thinks its locale is - * "en_US.UTF-8" while the other says "en_US.utf8". Try to be forgiving. + * Check that locale and encoding of a database in the old and new clusters + * are compatible. */ static void -check_locale_and_encoding(ControlData *oldctrl, - ControlData *newctrl) +check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb) { - if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate)) - pg_fatal("lc_collate cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->lc_collate, newctrl->lc_collate); - if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype)) - pg_fatal("lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->lc_ctype, newctrl->lc_ctype); - if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding)) - pg_fatal("encoding cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->encoding, newctrl->encoding); + if (olddb->db_encoding != newdb->db_encoding) + pg_fatal("encodings for database \"%s\" do not match: old \"%s\", new \"%s\"\n", + olddb->db_name, + pg_encoding_to_char(olddb->db_encoding), + pg_encoding_to_char(newdb->db_encoding)); + if (!equivalent_locale(LC_COLLATE, olddb->db_collate, newdb->db_collate)) + pg_fatal("lc_collate values for database \"%s\" do not match: old \"%s\", new \"%s\"\n", + olddb->db_name, olddb->db_collate, newdb->db_collate); + if (!equivalent_locale(LC_CTYPE, olddb->db_ctype, newdb->db_ctype)) + pg_fatal("lc_ctype values for database \"%s\" do not match: old \"%s\", new \"%s\"\n", + olddb->db_name, olddb->db_ctype, newdb->db_ctype); } /* @@ -373,61 +298,46 @@ check_locale_and_encoding(ControlData *oldctrl, * * Best effort locale-name comparison. Return false if we are not 100% sure * the locales are equivalent. + * + * Note: The encoding parts of the names are ignored. This function is + * currently used to compare locale names stored in pg_database, and + * pg_database contains a separate encoding field. That's compared directly + * in check_locale_and_encoding(). */ static bool -equivalent_locale(const char *loca, const char *locb) +equivalent_locale(int category, const char *loca, const char *locb) { - const char *chara = strrchr(loca, '.'); - const char *charb = strrchr(locb, '.'); - int lencmp; - - /* If they don't both contain an encoding part, just do strcasecmp(). */ - if (!chara || !charb) - return (pg_strcasecmp(loca, locb) == 0); + const char *chara; + const char *charb; + char *canona; + char *canonb; + int lena; + int lenb; /* - * Compare the encoding parts. Windows tends to use code page numbers for - * the encoding part, which equivalent_encoding() won't like, so accept if - * the strings are case-insensitive equal; otherwise use - * equivalent_encoding() to compare. + * If the names are equal, the locales are equivalent. Checking this + * first avoids calling setlocale() in the common case that the names + * are equal. That's a good thing, if setlocale() is buggy, for example. */ - if (pg_strcasecmp(chara + 1, charb + 1) != 0 && - !equivalent_encoding(chara + 1, charb + 1)) - return false; + if (pg_strcasecmp(loca, locb) == 0) + return true; /* - * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8). - * - * It's tempting to ignore non-alphanumeric chars here, but for now it's - * not clear that that's necessary; just do case-insensitive comparison. + * Not identical. Canonicalize both names, remove the encoding parts, + * and try again. */ - lencmp = chara - loca; - if (lencmp != charb - locb) - return false; + canona = get_canonical_locale_name(category, loca); + chara = strrchr(canona, '.'); + lena = chara ? (chara - canona) : strlen(canona); - return (pg_strncasecmp(loca, locb, lencmp) == 0); -} + canonb = get_canonical_locale_name(category, locb); + charb = strrchr(canonb, '.'); + lenb = charb ? (charb - canonb) : strlen(canonb); -/* - * equivalent_encoding() - * - * Best effort encoding-name comparison. Return true only if the encodings - * are valid server-side encodings and known equivalent. - * - * Because the lookup in pg_valid_server_encoding() does case folding and - * ignores non-alphanumeric characters, this will recognize many popular - * variant spellings as equivalent, eg "utf8" and "UTF-8" will match. - */ -static bool -equivalent_encoding(const char *chara, const char *charb) -{ - int enca = pg_valid_server_encoding(chara); - int encb = pg_valid_server_encoding(charb); + if (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0) + return true; - if (enca < 0 || encb < 0) - return false; - - return (enca == encb); + return false; } @@ -450,7 +360,35 @@ check_new_cluster_is_empty(void) new_cluster.dbarr.dbs[dbnum].db_name); } } +} + +/* + * Check that every database that already exists in the new cluster is + * compatible with the corresponding database in the old one. + */ +static void +check_databases_are_compatible(void) +{ + int newdbnum; + int olddbnum; + DbInfo *newdbinfo; + DbInfo *olddbinfo; + for (newdbnum = 0; newdbnum < new_cluster.dbarr.ndbs; newdbnum++) + { + newdbinfo = &new_cluster.dbarr.dbs[newdbnum]; + + /* Find the corresponding database in the old cluster */ + for (olddbnum = 0; olddbnum < old_cluster.dbarr.ndbs; olddbnum++) + { + olddbinfo = &old_cluster.dbarr.dbs[olddbnum]; + if (strcmp(newdbinfo->db_name, olddbinfo->db_name) == 0) + { + check_locale_and_encoding(olddbinfo, newdbinfo); + break; + } + } + } } @@ -470,7 +408,8 @@ create_script_for_cluster_analyze(char **analyze_script_file_name) if (os_info.user_specified) user_specification = psprintf("-U \"%s\" ", os_info.user); - *analyze_script_file_name = psprintf("analyze_new_cluster.%s", SCRIPT_EXT); + *analyze_script_file_name = psprintf("%sanalyze_new_cluster.%s", + SCRIPT_PREFIX, SCRIPT_EXT); if ((script = fopen_priv(*analyze_script_file_name, "w")) == NULL) pg_fatal("Could not open file \"%s\": %s\n", @@ -551,7 +490,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) int tblnum; char old_cluster_pgdata[MAXPGPATH]; - *deletion_script_file_name = psprintf("delete_old_cluster.%s", SCRIPT_EXT); + *deletion_script_file_name = psprintf("%sdelete_old_cluster.%s", + SCRIPT_PREFIX, SCRIPT_EXT); /* * Some users (oddly) create tablespaces inside the cluster data diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c index 8379ebd71b..4e9d5948fa 100644 --- a/contrib/pg_upgrade/controldata.c +++ b/contrib/pg_upgrade/controldata.c @@ -122,10 +122,6 @@ get_control_data(ClusterInfo *cluster, bool live_check) pg_fatal("Could not get control data using %s: %s\n", cmd, getErrorText(errno)); - /* Only pre-8.4 has these so if they are not set below we will check later */ - cluster->controldata.lc_collate = NULL; - cluster->controldata.lc_ctype = NULL; - /* Only in <= 9.2 */ if (GET_MAJOR_VERSION(cluster->major_version) <= 902) { @@ -404,36 +400,6 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.data_checksum_version = str2uint(p); got_data_checksum_version = true; } - /* In pre-8.4 only */ - else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL) - { - p = strchr(p, ':'); - - if (p == NULL || strlen(p) <= 1) - pg_fatal("%d: controldata retrieval problem\n", __LINE__); - - p++; /* remove ':' char */ - /* skip leading spaces and remove trailing newline */ - p += strspn(p, " "); - if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n') - *(p + strlen(p) - 1) = '\0'; - cluster->controldata.lc_collate = pg_strdup(p); - } - /* In pre-8.4 only */ - else if ((p = strstr(bufin, "LC_CTYPE:")) != NULL) - { - p = strchr(p, ':'); - - if (p == NULL || strlen(p) <= 1) - pg_fatal("%d: controldata retrieval problem\n", __LINE__); - - p++; /* remove ':' char */ - /* skip leading spaces and remove trailing newline */ - p += strspn(p, " "); - if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n') - *(p + strlen(p) - 1) = '\0'; - cluster->controldata.lc_ctype = pg_strdup(p); - } } if (output) diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c index a1773aa8e5..c347dfc493 100644 --- a/contrib/pg_upgrade/info.c +++ b/contrib/pg_upgrade/info.c @@ -239,11 +239,15 @@ get_db_infos(ClusterInfo *cluster) DbInfo *dbinfos; int i_datname, i_oid, + i_encoding, + i_datcollate, + i_datctype, i_spclocation; char query[QUERY_ALLOC]; snprintf(query, sizeof(query), - "SELECT d.oid, d.datname, %s " + "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, " + "%s AS spclocation " "FROM pg_catalog.pg_database d " " LEFT OUTER JOIN pg_catalog.pg_tablespace t " " ON d.dattablespace = t.oid " @@ -252,12 +256,15 @@ get_db_infos(ClusterInfo *cluster) "ORDER BY 2", /* 9.2 removed the spclocation column */ (GET_MAJOR_VERSION(cluster->major_version) <= 901) ? - "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid) AS spclocation"); + "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid)"); res = executeQueryOrDie(conn, "%s", query); i_oid = PQfnumber(res, "oid"); i_datname = PQfnumber(res, "datname"); + i_encoding = PQfnumber(res, "encoding"); + i_datcollate = PQfnumber(res, "datcollate"); + i_datctype = PQfnumber(res, "datctype"); i_spclocation = PQfnumber(res, "spclocation"); ntups = PQntuples(res); @@ -267,6 +274,9 @@ get_db_infos(ClusterInfo *cluster) { dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid)); dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname)); + dbinfos[tupnum].db_encoding = atoi(PQgetvalue(res, tupnum, i_encoding)); + dbinfos[tupnum].db_collate = pg_strdup(PQgetvalue(res, tupnum, i_datcollate)); + dbinfos[tupnum].db_ctype = pg_strdup(PQgetvalue(res, tupnum, i_datctype)); snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s", PQgetvalue(res, tupnum, i_spclocation)); } diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h index 56a7505a96..c3b81e4a08 100644 --- a/contrib/pg_upgrade/pg_upgrade.h +++ b/contrib/pg_upgrade/pg_upgrade.h @@ -76,6 +76,7 @@ extern char *output_files[]; #define PATH_SEPARATOR '/' #define RM_CMD "rm -f" #define RMDIR_CMD "rm -rf" +#define SCRIPT_PREFIX "./" #define SCRIPT_EXT "sh" #define ECHO_QUOTE "'" #define ECHO_BLANK "" @@ -86,6 +87,7 @@ extern char *output_files[]; #define PATH_SEPARATOR '\\' #define RM_CMD "DEL /q" #define RMDIR_CMD "RMDIR /s/q" +#define SCRIPT_PREFIX "" #define SCRIPT_EXT "bat" #define EXE_EXT ".exe" #define ECHO_QUOTE "" @@ -180,6 +182,9 @@ typedef struct char *db_name; /* database name */ char db_tablespace[MAXPGPATH]; /* database default tablespace * path */ + char *db_collate; + char *db_ctype; + int db_encoding; RelInfoArr rel_arr; /* array of all user relinfos */ } DbInfo; @@ -218,9 +223,6 @@ typedef struct bool date_is_int; bool float8_pass_by_value; bool data_checksum_version; - char *lc_collate; - char *lc_ctype; - char *encoding; } ControlData; /* diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index f4617b67e9..f98e282741 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1227,8 +1227,7 @@ <entry><type>bool</type></entry> <entry></entry> <entry> - This represents a not-null constraint. It is possible to - change this column to enable or disable the constraint. + This represents a not-null constraint. </entry> </row> diff --git a/doc/src/sgml/json.sgml b/doc/src/sgml/json.sgml index 37dd611aeb..8feb2fbf0a 100644 --- a/doc/src/sgml/json.sgml +++ b/doc/src/sgml/json.sgml @@ -269,6 +269,12 @@ SELECT '"foo"'::jsonb @> '"foo"'::jsonb; -- The array on the right side is contained within the one on the left: SELECT '[1, 2, 3]'::jsonb @> '[1, 3]'::jsonb; +-- Order of array elements is not significant, so this is also true: +SELECT '[1, 2, 3]'::jsonb @> '[3, 1]'::jsonb; + +-- Duplicate array elements don't matter either: +SELECT '[1, 2, 3]'::jsonb @> '[1, 2, 2]'::jsonb; + -- The object with a single pair on the right side is contained -- within the object on the left side: SELECT '{"product": "PostgreSQL", "version": 9.4, "jsonb":true}'::jsonb @> '{"version":9.4}'::jsonb; @@ -288,8 +294,10 @@ SELECT '{"foo": {"bar": "baz"}}'::jsonb @> '{"bar": "baz"}'::jsonb; -- yields f The general principle is that the contained object must match the containing object as to structure and data contents, possibly after discarding some non-matching array elements or object key/value pairs - from the containing object. However, the order of array elements is - not significant when doing a containment match. + from the containing object. + But remember that the order of array elements is not significant when + doing a containment match, and duplicate array elements are effectively + considered only once. </para> <para> diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml index f008e937ee..f195495520 100644 --- a/doc/src/sgml/plpgsql.sgml +++ b/doc/src/sgml/plpgsql.sgml @@ -487,8 +487,8 @@ $$ LANGUAGE plpgsql; CREATE FUNCTION extended_sales(p_itemno int) RETURNS TABLE(quantity int, total numeric) AS $$ BEGIN - RETURN QUERY SELECT quantity, quantity * price FROM sales - WHERE itemno = p_itemno; + RETURN QUERY SELECT s.quantity, s.quantity * s.price FROM sales AS s + WHERE s.itemno = p_itemno; END; $$ LANGUAGE plpgsql; </programlisting> diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 2af9413f21..e76b22fb2d 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -302,7 +302,7 @@ PROVE_FLAGS = --verbose # prepend to path if already set, else just set it define add_to_path -$(1)='$(if $($(1)),$(2):$$$(1),$(2))' +$(1)="$(if $($(1)),$(2):$$$(1),$(2))" endef # platform-specific environment variable to set shared library path diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 850008b340..8849c08e54 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -28,6 +28,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/tqual.h" diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index bab5a49187..b71f65de2c 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1186,7 +1186,7 @@ _bt_pagedel(Relation rel, Buffer buf) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains a half-dead internal page", RelationGetRelationName(rel)), - errhint("This can be caused by an interrupt VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); _bt_relbuf(rel, buf); return ndeleted; } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5a4dbb9c53..235b442296 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5193,8 +5193,8 @@ readRecoveryCommandFile(void) else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid recovery_target parameter"), - errhint("The only allowed value is 'immediate'"))); + errmsg("invalid value for recovery parameter \"recovery_target\""), + errhint("The only allowed value is \"immediate\"."))); ereport(DEBUG2, (errmsg_internal("recovery_target = '%s'", item->value))); @@ -5257,7 +5257,7 @@ readRecoveryCommandFile(void) "recovery_min_apply_delay"), hintmsg ? errhint("%s", _(hintmsg)) : 0)); ereport(DEBUG2, - (errmsg("recovery_min_apply_delay = '%s'", item->value))); + (errmsg_internal("recovery_min_apply_delay = '%s'", item->value))); } else ereport(FATAL, diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 55c1e79563..c0eade0a3d 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -69,6 +69,7 @@ #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 7831e900ba..b52e6b9bc0 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -839,7 +839,7 @@ dropdb(const char *dbname, bool missing_ok) if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active)) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("database \"%s\" is used by a logical decoding slot", + errmsg("database \"%s\" is used by a logical replication slot", dbname), errdetail_plural("There is %d slot, %d of them active.", "There are %d slots, %d of them active.", diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 781a736115..387d263e87 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -28,6 +28,7 @@ #include "utils/json.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/tuplesort.h" #include "utils/xml.h" @@ -1900,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es) if (es->format != EXPLAIN_FORMAT_TEXT) { ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es); + ExplainPropertyLong("Original Hash Buckets", + hashtable->nbuckets_original, es); ExplainPropertyLong("Hash Batches", hashtable->nbatch, es); ExplainPropertyLong("Original Hash Batches", hashtable->nbatch_original, es); ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es); } - else if (hashtable->nbatch_original != hashtable->nbatch) + else if ((hashtable->nbatch_original != hashtable->nbatch) || + (hashtable->nbuckets_original != hashtable->nbuckets)) { appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, - "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n", - hashtable->nbuckets, hashtable->nbatch, - hashtable->nbatch_original, spacePeakKb); + "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", + hashtable->nbuckets, hashtable->nbuckets_original, + hashtable->nbatch, hashtable->nbatch_original, spacePeakKb); } else { diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index d1c8bb0d53..30bd40db18 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -597,7 +597,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, { ereport(ERROR, (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("new data for \"%s\" contains duplicate rows without any NULL columns", + errmsg("new data for \"%s\" contains duplicate rows without any null columns", RelationGetRelationName(matviewRel)), errdetail("Row: %s", SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1)))); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cb16c53a60..ecdff1e5e3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -85,6 +85,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/relcache.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -9045,7 +9046,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, if (view_updatable_error) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("WITH CHECK OPTION is supported only on auto-updatable views"), + errmsg("WITH CHECK OPTION is supported only on automatically updatable views"), errhint("%s", view_updatable_error))); } } diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index ad364efbcb..55a68810f2 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -72,6 +72,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index 9d0039c42a..184bcd0582 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -471,7 +471,7 @@ DefineView(ViewStmt *stmt, const char *queryString) if (view_updatable_error) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("WITH CHECK OPTION is supported only on auto-updatable views"), + errmsg("WITH CHECK OPTION is supported only on automatically updatable views"), errhint("%s", view_updatable_error))); } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index b428c18b5c..7c5bb77b0c 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -39,6 +39,7 @@ static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable); static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse); static void ExecHashSkewTableInsert(HashJoinTable hashtable, @@ -117,6 +118,7 @@ MultiExecHash(HashState *node) /* It's a skew tuple, so put it into that hash table */ ExecHashSkewTableInsert(hashtable, slot, hashvalue, bucketNumber); + hashtable->skewTuples += 1; } else { @@ -127,6 +129,25 @@ MultiExecHash(HashState *node) } } + /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */ + if (hashtable->nbuckets != hashtable->nbuckets_optimal) + { + /* We never decrease the number of buckets. */ + Assert(hashtable->nbuckets_optimal > hashtable->nbuckets); + +#ifdef HJDEBUG + printf("Increasing nbuckets %d => %d\n", + hashtable->nbuckets, hashtable->nbuckets_optimal); +#endif + + ExecHashIncreaseNumBuckets(hashtable); + } + + /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */ + hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple); + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + /* must provide our own instrumentation support */ if (node->ps.instrument) InstrStopNode(node->ps.instrument, hashtable->totalTuples); @@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) */ hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); hashtable->nbuckets = nbuckets; + hashtable->nbuckets_original = nbuckets; + hashtable->nbuckets_optimal = nbuckets; hashtable->log2_nbuckets = log2_nbuckets; + hashtable->log2_nbuckets_optimal = log2_nbuckets; hashtable->buckets = NULL; hashtable->keepNulls = keepNulls; hashtable->skewEnabled = false; @@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) hashtable->nbatch_outstart = nbatch; hashtable->growEnabled = true; hashtable->totalTuples = 0; + hashtable->skewTuples = 0; hashtable->innerBatchFile = NULL; hashtable->outerBatchFile = NULL; hashtable->spaceUsed = 0; @@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) */ ninmemory = nfreed = 0; + /* If know we need to resize nbuckets, we can do it while rebatching. */ + if (hashtable->nbuckets_optimal != hashtable->nbuckets) + { + /* we never decrease the number of buckets */ + Assert(hashtable->nbuckets_optimal > hashtable->nbuckets); + + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal; + + hashtable->buckets = repalloc(hashtable->buckets, + sizeof(HashJoinTuple) * hashtable->nbuckets); + } + /* * We will scan through the chunks directly, so that we can reset the * buckets now and not have to keep track which tuples in the buckets have @@ -704,6 +742,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) } /* + * ExecHashIncreaseNumBuckets + * increase the original number of buckets in order to reduce + * number of tuples per bucket + */ +static void +ExecHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + HashMemoryChunk chunk; + + /* do nothing if not an increase (it's called increase for a reason) */ + if (hashtable->nbuckets >= hashtable->nbuckets_optimal) + return; + + /* + * We already know the optimal number of buckets, so let's just + * compute the log2_nbuckets for it. + */ + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal); + + Assert(hashtable->nbuckets > 1); + Assert(hashtable->nbuckets <= (INT_MAX / 2)); + Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets)); + +#ifdef HJDEBUG + printf("Increasing nbuckets to %d\n", hashtable->nbuckets); +#endif + + /* + * Just reallocate the proper number of buckets - we don't need to + * walk through them - we can walk the dense-allocated chunks + * (just like in ExecHashIncreaseNumBatches, but without all the + * copying into new chunks) + */ + hashtable->buckets = + (HashJoinTuple *) repalloc(hashtable->buckets, + hashtable->nbuckets * sizeof(HashJoinTuple)); + + memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets); + + /* scan through all tuples in all chunks to rebuild the hash table */ + for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next) + { + /* process all tuples stored in this chunk */ + size_t idx = 0; + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx); + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + /* add the tuple to the proper bucket */ + hashTuple->next = hashtable->buckets[bucketno]; + hashtable->buckets[bucketno] = hashTuple; + + /* advance index past the tuple */ + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + } + +#ifdef HJDEBUG + printf("Nbuckets increased to %d, average items per bucket %.1f\n", + hashtable->nbuckets, batchTuples / hashtable->nbuckets); +#endif +} + + +/* * ExecHashTableInsert * insert a tuple into the hash table depending on the hash value * it may just go to a temp file for later batches @@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable, */ HashJoinTuple hashTuple; int hashTupleSize; + double ntuples = (hashtable->totalTuples - hashtable->skewTuples); /* Create the HashJoinTuple */ hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; @@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable, hashTuple->next = hashtable->buckets[bucketno]; hashtable->buckets[bucketno] = hashTuple; + /* + * Increase the (optimal) number of buckets if we just exceeded the + * NTUP_PER_BUCKET threshold, but only when there's still a single batch. + */ + if ((hashtable->nbatch == 1) && + (hashtable->nbuckets_optimal <= INT_MAX/2) && /* overflow protection */ + (ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))) + { + hashtable->nbuckets_optimal *= 2; + hashtable->log2_nbuckets_optimal += 1; + } + /* Account for space used, and back off if we've used too much */ hashtable->spaceUsed += hashTupleSize; if (hashtable->spaceUsed > hashtable->spacePeak) hashtable->spacePeak = hashtable->spaceUsed; - if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple) + if (hashtable->spaceUsed + + hashtable->nbuckets_optimal * sizeof(HashJoinTuple) > hashtable->spaceAllowed) ExecHashIncreaseNumBatches(hashtable); } @@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable, * functions are good about randomizing all their output bits, else we are * likely to have very skewed bucket or batch occupancy.) * - * nbuckets doesn't change over the course of the join. + * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic + * bucket count growth. Once we start batching, the value is fixed and does + * not change over the course of the join (making it possible to compute batch + * number the way we do here). * * nbatch is always a power of 2; we increase it only by doubling it. This * effectively adds one more bit to the top of the batchno. diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c index 8d8f12952a..b05364ced0 100644 --- a/src/backend/libpq/be-secure-openssl.c +++ b/src/backend/libpq/be-secure-openssl.c @@ -614,7 +614,7 @@ be_tls_write(Port *port, void *ptr, size_t len) if (retries >= 20) ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unable to complete SSL handshake"))); + errmsg("could not complete SSL handshake on renegotiation, too many failures"))); } } } diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 7a38f2f150..092cf8fe43 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -21,8 +21,10 @@ */ #include "postgres.h" +#include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" +#include "utils/chash.h" /* entry for buffer lookup hashtable */ @@ -32,8 +34,13 @@ typedef struct int id; /* Associated buffer ID */ } BufferLookupEnt; -static HTAB *SharedBufHash; - +static CHashDescriptor SharedBufDescriptor = { + "buffer lookup table", + 0, + sizeof(BufferLookupEnt), + sizeof(BufferTag) +}; +static CHashTable SharedBufHash; /* * Estimate space needed for mapping hashtable @@ -42,7 +49,13 @@ static HTAB *SharedBufHash; Size BufTableShmemSize(int size) { - return hash_estimate_size(size, sizeof(BufferLookupEnt)); + if (SharedBufHash == NULL) + { + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashBootstrap(&SharedBufDescriptor); + } + + return CHashEstimateSize(SharedBufHash); } /* @@ -52,59 +65,29 @@ BufTableShmemSize(int size) void InitBufTable(int size) { - HASHCTL info; - - /* assume no locking is needed yet */ - - /* BufferTag maps to Buffer */ - info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(BufferLookupEnt); - info.hash = tag_hash; - info.num_partitions = NUM_BUFFER_PARTITIONS; - - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", - size, size, - &info, - HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); -} - -/* - * BufTableHashCode - * Compute the hash code associated with a BufferTag - * - * This must be passed to the lookup/insert/delete routines along with the - * tag. We do it like this because the callers need to know the hash code - * in order to determine which buffer partition to lock, and we don't want - * to do the hash computation twice (hash_any is a bit slow). - */ -uint32 -BufTableHashCode(BufferTag *tagPtr) -{ - return get_hash_value(SharedBufHash, (void *) tagPtr); + if (SharedBufHash == NULL || !IsUnderPostmaster) + { + Assert(SharedBufDescriptor.capacity == 0 || + SharedBufDescriptor.capacity == size); + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor); + } } /* * BufTableLookup * Lookup the given BufferTag; return buffer ID, or -1 if not found - * - * Caller must hold at least share lock on BufMappingLock for tag's partition */ int -BufTableLookup(BufferTag *tagPtr, uint32 hashcode) +BufTableLookup(BufferTag *tagPtr) { - BufferLookupEnt *result; - - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_FIND, - NULL); + BufferLookupEnt ent; - if (!result) + ent.key = *tagPtr; + if (!CHashSearch(SharedBufHash, &ent)) return -1; - return result->id; + return ent.id; } /* @@ -118,27 +101,20 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode) * Caller must hold exclusive lock on BufMappingLock for tag's partition */ int -BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) +BufTableInsert(BufferTag *tagPtr, int buf_id) { - BufferLookupEnt *result; - bool found; + BufferLookupEnt ent; + + ent.key = *tagPtr; + ent.id = buf_id; Assert(buf_id >= 0); /* -1 is reserved for not-in-table */ Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_ENTER, - &found); - - if (found) /* found something already in the table */ - return result->id; - - result->id = buf_id; + if (CHashInsert(SharedBufHash, &ent)) + return -1; - return -1; + return ent.id; } /* @@ -148,17 +124,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) * Caller must hold exclusive lock on BufMappingLock for tag's partition */ void -BufTableDelete(BufferTag *tagPtr, uint32 hashcode) +BufTableDelete(BufferTag *tagPtr) { - BufferLookupEnt *result; - - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_REMOVE, - NULL); - - if (!result) /* shouldn't happen */ + if (!CHashDelete(SharedBufHash, tagPtr)) elog(ERROR, "shared buffer hash table corrupted"); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 45d1d61d95..437deb905c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -429,22 +429,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) else { BufferTag newTag; /* identity of requested block */ - uint32 newHash; /* hash value for newTag */ - LWLock *newPartitionLock; /* buffer partition lock for it */ int buf_id; /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node, forkNum, blockNum); - /* determine its hash code and partition lock ID */ - newHash = BufTableHashCode(&newTag); - newPartitionLock = BufMappingPartitionLock(newHash); - /* see if the block is in the buffer pool already */ - LWLockAcquire(newPartitionLock, LW_SHARED); - buf_id = BufTableLookup(&newTag, newHash); - LWLockRelease(newPartitionLock); + buf_id = BufTableLookup(&newTag); /* If not in buffers, initiate prefetch */ if (buf_id < 0) @@ -822,11 +814,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, bool *foundPtr) { BufferTag newTag; /* identity of requested block */ - uint32 newHash; /* hash value for newTag */ - LWLock *newPartitionLock; /* buffer partition lock for it */ BufferTag oldTag; /* previous identity of selected buffer */ - uint32 oldHash; /* hash value for oldTag */ - LWLock *oldPartitionLock; /* buffer partition lock for it */ BufFlags oldFlags; int buf_id; volatile BufferDesc *buf; @@ -835,29 +823,31 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); - /* determine its hash code and partition lock ID */ - newHash = BufTableHashCode(&newTag); - newPartitionLock = BufMappingPartitionLock(newHash); - /* see if the block is in the buffer pool already */ - LWLockAcquire(newPartitionLock, LW_SHARED); - buf_id = BufTableLookup(&newTag, newHash); +start: + buf_id = BufTableLookup(&newTag); if (buf_id >= 0) { + BufferDesc *foundbuf; + /* * Found it. Now, pin the buffer so no one can steal it from the - * buffer pool, and check to see if the correct data has been loaded - * into the buffer. + * buffer pool. */ - buf = &BufferDescriptors[buf_id]; + foundbuf = &BufferDescriptors[buf_id]; - valid = PinBuffer(buf, strategy); + valid = PinBuffer(foundbuf, strategy); - /* Can release the mapping lock as soon as we've pinned it */ - LWLockRelease(newPartitionLock); + /* Check whether someone recycled the buffer before we pinned it. */ + if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag)) + { + UnpinBuffer(foundbuf, true); + goto start; + } *foundPtr = TRUE; + /* Check to see if the correct data has been loaded into the buffer. */ if (!valid) { /* @@ -867,7 +857,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * own read attempt if the page is still not BM_VALID. * StartBufferIO does it all. */ - if (StartBufferIO(buf, true)) + if (StartBufferIO(foundbuf, true)) { /* * If we get here, previous attempts to read the buffer must @@ -877,15 +867,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } - return buf; + return foundbuf; } - /* - * Didn't find it in the buffer pool. We'll have to initialize a new - * buffer. Remember to unlock the mapping lock while doing the work. - */ - LWLockRelease(newPartitionLock); - /* Loop here in case we have to try another victim buffer */ for (;;) { @@ -986,42 +970,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ if (oldFlags & BM_TAG_VALID) { - /* - * Need to compute the old tag's hashcode and partition lock ID. - * XXX is it worth storing the hashcode in BufferDesc so we need - * not recompute it here? Probably not. - */ + /* Save old tag. */ oldTag = buf->tag; - oldHash = BufTableHashCode(&oldTag); - oldPartitionLock = BufMappingPartitionLock(oldHash); - - /* - * Must lock the lower-numbered partition first to avoid - * deadlocks. - */ - if (oldPartitionLock < newPartitionLock) - { - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - } - else if (oldPartitionLock > newPartitionLock) - { - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - } - else - { - /* only one partition, only one lock */ - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - } - } - else - { - /* if it wasn't valid, we need only the new partition */ - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - /* these just keep the compiler quiet about uninit variables */ - oldHash = 0; - oldPartitionLock = 0; } /* @@ -1031,32 +981,34 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Note that we have not yet removed the hashtable entry for the old * tag. */ - buf_id = BufTableInsert(&newTag, newHash, buf->buf_id); +enter: + buf_id = BufTableInsert(&newTag, buf->buf_id); if (buf_id >= 0) { + BufferDesc *foundbuf; + /* - * Got a collision. Someone has already done what we were about to - * do. We'll just handle this as if it were found in the buffer - * pool in the first place. First, give up the buffer we were - * planning to use. + * We've got a collision, apparently: it looks like someone else + * did what we were about to do. We can handle this as if we had + * found the buffer in the pool in the first place, but we must + * recheck the buffer tag after pinning it, because it could still + * get renamed under us. + */ + foundbuf = &BufferDescriptors[buf_id]; + valid = PinBuffer(foundbuf, strategy); + if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag)) + { + UnpinBuffer(foundbuf, true); + goto enter; + } + + /* + * Collision confirmed. Give up the buffer we were planning to + * use. */ UnpinBuffer(buf, true); - /* Can give up that buffer's mapping partition lock now */ - if ((oldFlags & BM_TAG_VALID) && - oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - - /* remaining code should match code at top of routine */ - - buf = &BufferDescriptors[buf_id]; - - valid = PinBuffer(buf, strategy); - - /* Can release the mapping lock as soon as we've pinned it */ - LWLockRelease(newPartitionLock); - *foundPtr = TRUE; if (!valid) @@ -1068,7 +1020,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * then set up our own read attempt if the page is still not * BM_VALID. StartBufferIO does it all. */ - if (StartBufferIO(buf, true)) + if (StartBufferIO(foundbuf, true)) { /* * If we get here, previous attempts to read the buffer @@ -1078,7 +1030,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } - return buf; + return foundbuf; } /* @@ -1097,11 +1049,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, break; UnlockBufHdr(buf); - BufTableDelete(&newTag, newHash); - if ((oldFlags & BM_TAG_VALID) && - oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - LWLockRelease(newPartitionLock); + BufTableDelete(&newTag); UnpinBuffer(buf, true); } @@ -1124,13 +1072,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, UnlockBufHdr(buf); if (oldFlags & BM_TAG_VALID) - { - BufTableDelete(&oldTag, oldHash); - if (oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - } - - LWLockRelease(newPartitionLock); + BufTableDelete(&oldTag); /* * Buffer contents are currently invalid. Try to get the io_in_progress @@ -1166,42 +1108,11 @@ static void InvalidateBuffer(volatile BufferDesc *buf) { BufferTag oldTag; - uint32 oldHash; /* hash value for oldTag */ - LWLock *oldPartitionLock; /* buffer partition lock for it */ BufFlags oldFlags; /* Save the original buffer tag before dropping the spinlock */ oldTag = buf->tag; - UnlockBufHdr(buf); - - /* - * Need to compute the old tag's hashcode and partition lock ID. XXX is it - * worth storing the hashcode in BufferDesc so we need not recompute it - * here? Probably not. - */ - oldHash = BufTableHashCode(&oldTag); - oldPartitionLock = BufMappingPartitionLock(oldHash); - -retry: - - /* - * Acquire exclusive mapping lock in preparation for changing the buffer's - * association. - */ - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - - /* Re-lock the buffer header */ - LockBufHdr(buf); - - /* If it's changed while we were waiting for lock, do nothing */ - if (!BUFFERTAGS_EQUAL(buf->tag, oldTag)) - { - UnlockBufHdr(buf); - LWLockRelease(oldPartitionLock); - return; - } - /* * We assume the only reason for it to be pinned is that someone else is * flushing the page out. Wait for them to finish. (This could be an @@ -1211,15 +1122,21 @@ retry: * yet done StartBufferIO, WaitIO will fall through and we'll effectively * be busy-looping here.) */ - if (buf->refcount != 0) + while (buf->refcount != 0) { UnlockBufHdr(buf); - LWLockRelease(oldPartitionLock); /* safety check: should definitely not be our *own* pin */ if (GetPrivateRefCount(buf->buf_id) > 0) elog(ERROR, "buffer is pinned in InvalidateBuffer"); WaitIO(buf); - goto retry; + LockBufHdr(buf); + + /* If it's changed while we were waiting for lock, do nothing */ + if (!BUFFERTAGS_EQUAL(buf->tag, oldTag)) + { + UnlockBufHdr(buf); + return; + } } /* @@ -1237,12 +1154,7 @@ retry: * Remove the buffer from the lookup hashtable, if it was in there. */ if (oldFlags & BM_TAG_VALID) - BufTableDelete(&oldTag, oldHash); - - /* - * Done with mapping lock. - */ - LWLockRelease(oldPartitionLock); + BufTableDelete(&oldTag); /* * Insert the buffer at the head of the list of free buffers. diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c index d96627a774..90df5930e1 100644 --- a/src/backend/storage/ipc/shm_mq.c +++ b/src/backend/storage/ipc/shm_mq.c @@ -139,7 +139,7 @@ struct shm_mq_handle }; static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes, - void *data, bool nowait, Size *bytes_written); + const void *data, bool nowait, Size *bytes_written); static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed, bool nowait, Size *nbytesp, void **datap); static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr, @@ -301,7 +301,33 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) } /* + * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had + * been passed to shm_mq_attach. + */ +void +shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle) +{ + Assert(mqh->mqh_handle == NULL); + mqh->mqh_handle = handle; +} + +/* * Write a message into a shared message queue. + */ +shm_mq_result +shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait) +{ + shm_mq_iovec iov; + + iov.data = data; + iov.len = nbytes; + + return shm_mq_sendv(mqh, &iov, 1, nowait); +} + +/* + * Write a message into a shared message queue, gathered from multiple + * addresses. * * When nowait = false, we'll wait on our process latch when the ring buffer * fills up, and then continue writing once the receiver has drained some data. @@ -315,14 +341,22 @@ shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) * the length or payload will corrupt the queue.) */ shm_mq_result -shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait) +shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait) { shm_mq_result res; shm_mq *mq = mqh->mqh_queue; + Size nbytes = 0; Size bytes_written; + int i; + int which_iov = 0; + Size offset; Assert(mq->mq_sender == MyProc); + /* Compute total size of write. */ + for (i = 0; i < iovcnt; ++i) + nbytes += iov[i].len; + /* Try to write, or finish writing, the length word into the buffer. */ while (!mqh->mqh_length_word_complete) { @@ -348,18 +382,80 @@ shm_mq_send(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait) /* Write the actual data bytes into the buffer. */ Assert(mqh->mqh_partial_bytes <= nbytes); - res = shm_mq_send_bytes(mqh, nbytes - mqh->mqh_partial_bytes, - ((char *) data) + mqh->mqh_partial_bytes, - nowait, &bytes_written); - if (res == SHM_MQ_WOULD_BLOCK) - mqh->mqh_partial_bytes += bytes_written; - else + offset = mqh->mqh_partial_bytes; + do { - mqh->mqh_partial_bytes = 0; - mqh->mqh_length_word_complete = false; - } - if (res != SHM_MQ_SUCCESS) - return res; + Size chunksize; + + /* Figure out which bytes need to be sent next. */ + if (offset >= iov[which_iov].len) + { + offset -= iov[which_iov].len; + ++which_iov; + if (which_iov >= iovcnt) + break; + continue; + } + + /* + * We want to avoid copying the data if at all possible, but every + * chunk of bytes we write into the queue has to be MAXALIGN'd, + * except the last. Thus, if a chunk other than the last one ends + * on a non-MAXALIGN'd boundary, we have to combine the tail end of + * its data with data from one or more following chunks until we + * either reach the last chunk or accumulate a number of bytes which + * is MAXALIGN'd. + */ + if (which_iov + 1 < iovcnt && + offset + MAXIMUM_ALIGNOF > iov[which_iov].len) + { + char tmpbuf[MAXIMUM_ALIGNOF]; + int j = 0; + + for (;;) + { + if (offset < iov[which_iov].len) + { + tmpbuf[j] = iov[which_iov].data[offset]; + j++; + offset++; + if (j == MAXIMUM_ALIGNOF) + break; + } + else + { + offset -= iov[which_iov].len; + which_iov++; + if (which_iov >= iovcnt) + break; + } + } + res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written); + mqh->mqh_partial_bytes += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + continue; + } + + /* + * If this is the last chunk, we can write all the data, even if it + * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to + * MAXALIGN_DOWN the write size. + */ + chunksize = iov[which_iov].len - offset; + if (which_iov + 1 < iovcnt) + chunksize = MAXALIGN_DOWN(chunksize); + res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset], + nowait, &bytes_written); + mqh->mqh_partial_bytes += bytes_written; + offset += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + } while (mqh->mqh_partial_bytes < nbytes); + + /* Reset for next message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; /* Notify receiver of the newly-written data, and return. */ return shm_mq_notify_receiver(mq); @@ -653,8 +749,8 @@ shm_mq_detach(shm_mq *mq) * Write bytes into a shared message queue. */ static shm_mq_result -shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait, - Size *bytes_written) +shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, + bool nowait, Size *bytes_written) { shm_mq *mq = mqh->mqh_queue; Size sent = 0; diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 2ea2216a65..38614a449d 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) return structPtr; } +/* + * ShmemInitStruct -- Attach to an existing structure in shared memory. + */ +void * +ShmemAttachStruct(const char *name) +{ + ShmemIndexEnt *result; + void *ptr; + bool found; + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_FIND, &found); + if (!found || result == NULL) + elog(ERROR, "shared memory structure %s not found", name); + ptr = result->location; + Assert(ptr != NULL); + + LWLockRelease(ShmemIndexLock); + + return ptr; +} /* * Add two Size values, checking for overflow diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c index 2d071b2523..d9aaac9ac2 100644 --- a/src/backend/utils/adt/jsonb_op.c +++ b/src/backend/utils/adt/jsonb_op.c @@ -57,7 +57,7 @@ jsonb_exists_any(PG_FUNCTION_ARGS) for (i = 0; i < elem_count; i++) { - JsonbValue strVal; + JsonbValue strVal; if (key_nulls[i]) continue; @@ -90,7 +90,7 @@ jsonb_exists_all(PG_FUNCTION_ARGS) for (i = 0; i < elem_count; i++) { - JsonbValue strVal; + JsonbValue strVal; if (key_nulls[i]) continue; @@ -117,8 +117,7 @@ jsonb_contains(PG_FUNCTION_ARGS) JsonbIterator *it1, *it2; - if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) || - JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl)) + if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl)) PG_RETURN_BOOL(false); it1 = JsonbIteratorInit(&val->root); @@ -137,8 +136,7 @@ jsonb_contained(PG_FUNCTION_ARGS) JsonbIterator *it1, *it2; - if (JB_ROOT_COUNT(val) < JB_ROOT_COUNT(tmpl) || - JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl)) + if (JB_ROOT_IS_OBJECT(val) != JB_ROOT_IS_OBJECT(tmpl)) PG_RETURN_BOOL(false); it1 = JsonbIteratorInit(&val->root); diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index f157df3532..2ff85396d0 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -957,13 +957,24 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained) } else if (rcont == WJB_BEGIN_OBJECT) { - JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */ - + Assert(vval.type == jbvObject); Assert(vcontained.type == jbvObject); + /* + * If the lhs has fewer pairs than the rhs, it can't possibly contain + * the rhs. (This conclusion is safe only because we de-duplicate + * keys in all Jsonb objects; thus there can be no corresponding + * optimization in the array case.) The case probably won't arise + * often, but since it's such a cheap check we may as well make it. + */ + if (vval.val.object.nPairs < vcontained.val.object.nPairs) + return false; + /* Work through rhs "is it contained within?" object */ for (;;) { + JsonbValue *lhsVal; /* lhsVal is from pair in lhs object */ + rcont = JsonbIteratorNext(mContained, &vcontained, false); /* @@ -1047,6 +1058,7 @@ JsonbDeepContains(JsonbIterator **val, JsonbIterator **mContained) JsonbValue *lhsConts = NULL; uint32 nLhsElems = vval.val.array.nElems; + Assert(vval.type == jbvArray); Assert(vcontained.type == jbvArray); /* diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 4eeb6314fa..67539ecde9 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -35,6 +35,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/lsyscache.h" +#include "utils/ruleutils.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/timestamp.h" diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 6e41cbd142..24ade6cc20 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -55,6 +55,7 @@ #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile index 05d347c856..5d5338266d 100644 --- a/src/backend/utils/hash/Makefile +++ b/src/backend/utils/hash/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/hash top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = dynahash.o hashfn.o +OBJS = chash.o dynahash.o hashfn.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c new file mode 100644 index 0000000000..0d4dc78a4e --- /dev/null +++ b/src/backend/utils/hash/chash.c @@ -0,0 +1,1075 @@ +/*------------------------------------------------------------------------- + * + * chash.c + * concurrent hash tables + * + * A concurrent hash table stores a collection of fixed-size objects. + * From the point of view of this module, such objects are merely an + * opaque array of bytes, but the caller will typically implement them as + * a C "struct". Some fixed-size, leading portion of each object is + * designated as the key, which must be distinct for all objects in the + * collection. Since PostgreSQL's shared memory model does not permit + * dynamic shared-memory allocation, we preallocate shared-memory space + * for the maximum number of entities which can be stored (plus a few + * extra, for reasons that will be further explained below). This space + * is allocated as a single large array called the arena, and we often + * refer to entities by their position in the arena rather than via an + * ordinary pointer. This saves a considerable amount of memory, since + * most modern architectures are 64-bit and therefore use 8-byte pointers, + * while arena offsets can be stored in a 32-bit word. In fact, we + * reserve one bit in each such word as a mark bit, so the maximum size + * of the arena is 2^31 elements, a restriction that does not currently + * appear to be problematic. An additional advantage of this representation + * is that aligned 32-bit loads and stores are atomic on all architectures + * we support, but 64-bit loads and stores are not. + * + * When an element is inserted, we copy the data from the backend-private + * object supplied by the caller into one of these shared-memory entities. + * When the hash table is searched, the caller passes a backend-private + * entity with just the key filled in; if a matching element is found, + * data is copied from the shared memory entity into the non-key portion + * of the user-supplied entity. In this way, clients of this module + * never use pointers into shared memory directly. + * + * As normal, we structure the hash table as an array of buckets, whose + * size is always a power of two, so that the low-order bytes of the + * hash code can be used to select a bucket. If multiple entities has + * to the same bucket, we use separate chaining: each entity in the + * arena has an 8-byte header that stores the 4-byte arena offset of the + * next item in the bucket and the hash value of the entity's key. + * Bucket chains are maintained in order by ascending hash value and + * then by ascending entity key (as per memcmp) so that there is + * precisely one legal location at which a given new item can be inserted + * into a bucket. + * + * Items are inserted into buckets using compare-and-swap (CAS). Thus, this + * module cannot be used on architectures where we do not have 4-byte + * compare-and-swap. When an item is deleted, we first set its mark bit, + * which is stored within the next-pointer, again using CAS. Once this + * step is completed, the node is deleted. As a cleanup operation, we then + * use CAS to modify the next-pointer of the previous node to point to the + * node following the deleted node. Note that, even once this cleanup + * operation has been performed, some other backend concurrently walking the + * chain might still be visiting the deleted node. Thus, we must be certain + * not to overwrite the deleted node's next-pointer until all concurrent + * bucket scans have completed. This means, in particular, that we cannot + * immediately view the deleted node as available for reuse. + * + * Instead, when a delete-marked node is removed from the bucket chain, it + * is added to one of many garbage lists. There is a many-to-one mapping from + * buckets to garbage lists, so that the choice of bucket determines the + * garbage list but not visca versa. Any process which wishes to scan a bucket + * must first advertise in shared memory the corresponding garbage list number. + * When a backend wishes to move entries from a garbage list to a free list, + * it must first wait (by spinning) for any backends scanning that garbage + * list to complete their scans. + * + * Ideally, it would be nice to make this completely lock-free, but because + * of the above-described choice of garbage collection algorithm, it currently + * isn't. For an algorithm to be lock-free, it must be possible to suspend + * the execution of any number of processes for an arbitrary period of time + * without impeding the overall progress of the system. For this code, that + * is true except when garbage collection occurs. In that case, an insert, + * search, or delete operation can obstruct an inserting thread attempting to + * perform garbage collection for an unbounded period of time. The algorithm + * could be adapted as to be completely lock-free, essentially by guaranteeing + * that even in the worst case no combination of processes can obstruct garbage + * collection to a sufficient degree as to prevent an inserter from finding + * an available entry in a hash table containing fewer live elements than its + * declared maximum capacity. However, it's not clear that this is a good + * idea, because it would likely slow down operation in the non-contended + * case to solve a problem that we hope won't happen anyway. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/hash/chash.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "access/hash.h" +#include "storage/barrier.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "utils/chash.h" +#include "utils/memutils.h" + +/* + * CHashPtr represents an offset into the arena, plus a mark bit that is + * used to implement concurrent deletion. + */ +typedef uint32 CHashPtr; +#define InvalidCHashPtr ((uint32) -2) +#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr) +#define CHashPtrIsMarked(x) ((x) & 1) +#define CHashPtrGetOffset(x) ((x) >> 1) +#define CHashPtrMark(x) ((x) | 1) +#define CHashPtrUnmark(x) ((x) & ~1) +#define MakeCHashPtr(x) ((x) << 1) +#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr) + +/* + * Each object stored in the hash table is represented by a CHashNode, which + * stores a pointer to the next item in the same bucket, and the exact hash + * value of the current item. Each CHashNode is followed by space for the + * item itself. + */ +typedef struct +{ + CHashPtr next; /* arena offset of next element */ + union + { + uint32 hashcode; /* hash(key) */ + CHashPtr gcnext; /* arena offset of next garbage item */ + } un; +} CHashNode; + +#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode)) +#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode) + +/* + * CHashTableData stores all the information that we need in order to access + * a concurrent hash table. We store one copy of this data in shared memory, + * and an additional copy in the private memory of each backend accessing the + * table. + */ +typedef struct CHashTableData +{ + /* + * These fields do not change after initialization. + */ + CHashDescriptor desc; /* descriptor for this hash table */ + uint32 bucket_mask; /* # of buckets, minus one */ + uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */ + uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */ + uint16 nfreelists; /* # of freelists */ + uint32 arena_limit; /* # of arena elements */ + uint32 arena_stride; /* bytes allocated per arena element */ + CHashPtr *bucket; /* array with 1 entry per bucket */ + CHashPtr *extra; /* entries for garbage and free lists */ + char *arena; /* arena */ + + /* + * These fields will be different in each backend; the shared copy is + * irrelevant. + */ + int gc_pid; /* PID that set gc_next */ + uint32 gc_next; /* next garbage list to reclaim */ + uint64 stats[CHS_NumberOfStatistics]; /* statistics */ +} CHashTableData; + +/* Compute # of buckets, garbage lists, or free lists. */ +#define CHashTableNBuckets(table) \ + ((table)->bucket_mask + 1) +#define CHashTableNGarbage(table) \ + (CHashTableNBuckets((table)) >> (table)->garbage_shift) +#define CHashTableNFreeLists(table) \ + ((table)->nfreelists) + +/* + * Garbage lists and free lists are interleaved to reduce cache line + * contention on the free lists, so the calculation of where an individual + * list is located is a bit complex. + */ +#define CHashTableGetGarbageList(table, n) \ + (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)]) +#define CHashTableGetGarbageByBucket(table, n) \ + (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift)) +#define CHashTableGetFreeList(table, n) \ + (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)]) + +/* Access macros for arena nodes. */ +#define CHashTableGetRaw(table, offset) \ + (AssertMacro((offset) < (table)->arena_limit), \ + (CHashNode *) ((table)->arena + (table)->arena_stride * (offset))) +#define CHashTableGetNode(table, ptr) \ + (AssertMacro(!CHashPtrIsInvalid(ptr)), \ + CHashTableGetRaw((table), CHashPtrGetOffset((ptr)))) + +/* Statistics macros. */ +#define CHashTableIncrementStatistic(table, stat) \ + ((table)->stats[(stat)]++) + +/* + * Bucket scan. + */ +typedef struct +{ + CHashPtr target; + CHashPtr next; + CHashPtr *pointer_to_target; + CHashNode *target_node; + bool found; +} CHashScanResult; + +/* Human-readable statistics names. */ +char *CHashStatisticsNames[] = { + "searches", + "searches failed", + "inserts", + "inserts failed", + "inserts retried", + "deletions", + "deletions failed", + "deletions retried", + "scan expunges", + "scan expunges failed", + "scans restarted", + "cleanup scans", + "allocations failed", + "garbage enqueues retried", + "garbage dequeues failed", + "garbage collections", + "garbage collection spins", + "garbage collection reclaims skipped", + "garbage collection fast reclaims", + "garbage collection reclaims retried", + "<end>" +}; + +/* Function prototypes. */ +static CHashPtr CHashAllocate(CHashTable table); +static CHashPtr CHashAllocateViaGC(CHashTable table); +static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c); +static void CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res); + +/* + * First stage of CHashTable initialization. We fill in all the constants + * here, but not the pointers. + */ +CHashTable +CHashBootstrap(CHashDescriptor *desc) +{ + CHashTable table; + uint32 bucket_shift; + + /* Sanity check. */ + Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>")); + + /* Allocate table and copy descriptor. */ + table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData)); + memcpy(&table->desc, desc, sizeof(CHashDescriptor)); + + /* Sanity checks. */ + if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity) + elog(ERROR, "invalid capacity for concurrent hash"); + if (desc->key_size < 1 || desc->key_size > desc->element_size) + elog(ERROR, "invalid key size for concurrent hash"); + + /* + * The number of buckets must be a power of two. To avoid (as much as + * possible) having to traverse long bucket chains, we aim for a load + * factor <= 1.0, so this is a pretty simple calculation: we just find the + * smallest power of two greater than or equal to the target capacity. + */ + bucket_shift = fls(desc->capacity) - 1; + table->bucket_mask = (1 << bucket_shift) - 1; + + /* + * We choose to have one garbage list for every 64 hash table buckets + * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in + * total, in which case we still have a minimum of one garbage list. + * + * Increasing the garbage_shift would reduce the likelihood of a backend + * performing garbage collection needing to wait for a backend walking a + * bucket to finish its scan. On the other hand, decreasing the garbage + * shift would allow more items to be recovered in a single garbage + * collection cycle. It's not clear what the optimal value is. + */ + table->garbage_shift = Min(bucket_shift, 6); + table->gc_next = 0; + table->gc_pid = 0; + + /* + * Experimentation reveals that the free list manipulation is both one of + * the slowest parts of this algorithm and the most vulnerable to + * contention. Therefore, we want to have as many free lists as possible, + * but there's no need to have more than the number of CPU cores, so we + * limit the number of freelists to 64. There might be a benefit in some + * larger limit on a really big system, but we'll cap it here pending some + * actual test results. We're also limited to having no more freelists + * than we do garbage lists. + */ +#define LOG2_MAX_FREELIST 6 + if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST) + table->freelist_shift = 0; + else + table->freelist_shift = + (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST; + table->nfreelists = + 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift)); + + /* + * To make garbage collection efficient, we overallocate. Normally, we + * overallocate by one-eighth, but if that would be less than 15 elements, + * then we allocate 15 elements instead. This extra capacity can actually + * be used, but for best performance, it shouldn't be. It's the caller's + * responsibility to avoid this. + */ + table->arena_limit = desc->capacity; + if (desc->capacity < 120) + table->arena_limit += 15; + else + table->arena_limit += table->arena_limit / 8; + + /* Each arena element must be MAXALIGN'd and include per-node space. */ + table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size); + + /* Zero out statistics. */ + memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics); + + return table; +} + +/* + * Estimate shared memory requirements. + */ +Size +CHashEstimateSize(CHashTable table) +{ + Size total_buckets; + Size size; + Size nbuckets = CHashTableNBuckets(table); + Size ngarbage = CHashTableNGarbage(table); + Size nfreelists = CHashTableNFreeLists(table); + + Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0); + total_buckets = add_size(nbuckets, ngarbage); + total_buckets = add_size(total_buckets, nfreelists); + + size = MAXALIGN(sizeof(CHashTableData)); + size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets)); + size = add_size(size, mul_size(table->arena_stride, table->arena_limit)); + + return size; +} + +/* + * Create a concurrent hash table in shared memory, or attach to an existing + * table. + */ +CHashTable +CHashInitialize(CHashTable table, CHashDescriptor *desc) +{ + Size size; + bool found; + void *shmem; + uint32 i; + uint32 nbuckets; + uint32 nfreelists; + uint32 ngarbage; + uint32 nextra; + + /* + * If we're under the postmaster, this must be the EXEC_BACKEND case where + * we need to attach to an existing shared-memory segment. + */ + if (IsUnderPostmaster) + { + void *shmem; + + Assert(table == NULL); + table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData)); + shmem = ShmemAttachStruct(desc->shmem_name); + memcpy(table, shmem, sizeof(CHashTableData)); + return table; + } + + /* + * Otherwise, the hash table should not already exist, and we must + * create it. But the table should already be bootstrapped, since we + * must previously have computed its size when figuring out our shared + * memory allocation. + */ + Assert(table != NULL); + size = CHashEstimateSize(table); + shmem = ShmemInitStruct(table->desc.shmem_name, size, &found); + Assert(!found); + + /* Bucket, garbage, and freelist arrays follow table info. */ + table->bucket = (CHashPtr *) + (((char *) shmem) + MAXALIGN(sizeof(CHashTableData))); + nbuckets = CHashTableNBuckets(table); + table->extra = &table->bucket[nbuckets]; + + /* Arena follows the various lists. */ + ngarbage = CHashTableNGarbage(table); + nfreelists = CHashTableNFreeLists(table); + nextra = ngarbage + nfreelists; + table->arena = (void *) (&table->extra[nextra]); + + /* Initialize all three sets of lists to empty. */ + for (i = 0; i < nbuckets; ++i) + table->bucket[i] = InvalidCHashPtr; + for (i = 0; i < nextra; ++i) + table->extra[i] = InvalidCHashPtr; + + /* Put all arena elements on the free lists. */ + for (i = 0; i < table->arena_limit; ++i) + { + CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists); + CHashNode *n = CHashTableGetRaw(table, i); + + n->un.gcnext = *f; + *f = MakeCHashPtr(i); + } + + /* + * Copy table (with pointers now filled in) to shared memory. This is + * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway. + */ + memcpy(shmem, table, sizeof(CHashTableData)); + + return table; +} + +/* + * Search a concurrent hash table. entry should be a block of memory large + * enough to hold a complete entry, with just the key portion filled in. If + * a matching entry is found, this function will fill in the rest of the entry + * from the data in the hash table and return true. If not, it will return + * false. + */ +bool +CHashSearch(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashScanResult scan; + + /* Prevent garbage collection for this bucket. */ + Assert(MyProc->hazard[0] == NULL); + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* Scan bucket and return data from any matching entry. */ + CHashBucketScan(table, b, hashcode, entry, &scan); + if (scan.found) + memcpy(((char *) entry) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, + table->desc.element_size - table->desc.key_size); + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + CHashTableIncrementStatistic(table, CHS_Search); + if (!scan.found) + CHashTableIncrementStatistic(table, CHS_Search_Failed); + return scan.found; +} + +/* + * Insert into a concurrent hash table. entry should be the complete entry + * to be inserted. If no duplicate entry is found in the table, this function + * will insert the entry and return true. Otherwise, the duplicate entry's + * value will be copied into the supplied entry and the function will return + * false. + * + * The caller is responsible for ensuring that no inserts are performed into + * a hash table which is at capacity. The behavor of such an insert is + * undefined (the actual behavior is that the insert may either succeed, + * degrading performance; or CHashAllocate may enter a tight loop until such + * time as an element is deleted). + */ +bool +CHashInsert(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashPtr new; + CHashNode *nnew; + CHashScanResult scan; + + /* + * Allocate and initialize a new entry, on the assumption that the insert + * will succeed. If it ends up failing, we must be sure to put this back + * on some free list, lest it be permanently leaked. + */ + new = CHashAllocate(table); + nnew = CHashTableGetNode(table, new); + nnew->un.hashcode = hashcode; + memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size); + + /* Prevent garbage collection for this bucket. */ + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* + * Scan the bucket. If we don't find a match, use compare-and-swap to + * insert the new node at the insert position. If we do find a match, + * return the data to the caller. + */ +retry: + CHashBucketScan(table, b, hashcode, entry, &scan); + if (scan.found) + memcpy(((char *) entry) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, + table->desc.element_size - table->desc.key_size); + else + { + /* + * We didn't find a matching element, so use compare-and-swap to + * attempt to insert the new element we've prepared. If this fails, + * someone currently inserted or deleted an element. The correct + * insertion point might have changed, or the key we're trying to + * insert might now be present when it wasn't before, so we'll have + * to search the bucket chain anew. + * + * There is a risk of starvation here, but we hope it will not happen + * in practice. Contention for inserting new elements should be + * spread out pretty much evenly across N+M possible insertion points, + * where N is the number of buckets and M is the number of elements + * in the table. Even for a quite modestly size table this is likely + * to exceed the number of CPU cores. + */ + Assert(!CHashPtrIsMarked(scan.target)); + nnew->next = scan.target; + if (!__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, new)) + { + CHashTableIncrementStatistic(table, CHS_Insert_Retry); + goto retry; + } + } + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + /* + * If the insert failed, add the entry we found to the appropriate + * garbage list. We can't simply put this back on the freelist, + * because that leads to an ABA problem: some other backend could + * read the head of the freelist, go into the tank, and then use + * CAS to pop an element. If at that point, it finds the same + * element at the head of the freelist but with a different successor, + * we're hosed. To prevent that, we ensure that elements are added + * to a given freelist only after verifying that any allocations in + * progress at the time we popped the freelist has completed. This + * guarantees that any allocation still in progress at the time this + * element makes it back to the freelist is trying to allocate some + * other node. + */ + CHashTableIncrementStatistic(table, CHS_Insert); + if (scan.found) + { + CHashTableIncrementStatistic(table, CHS_Insert_Failed); + CHashAddToGarbage(table, bucket, new); + } + + /* The insert succeeded if and only if no duplicate was found. */ + return !scan.found; +} + +/* + * Delete from a concurrent hash table. entry need only contain the key field. + * Returns true if we find and delete a matching key and false otherwise. + */ +bool +CHashDelete(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashScanResult scan; + + /* Prevent garbage collection for this bucket. */ + Assert(MyProc->hazard[0] == NULL); + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* Scan bucket. */ +retry: + CHashBucketScan(table, b, hashcode, entry, &scan); + + /* If we found it, try to delete it. */ + if (scan.found) + { + Assert(!CHashPtrIsMarked(scan.next)); + + /* Attempt to apply delete-mark. */ + if (!__sync_bool_compare_and_swap(&scan.target_node->next, + scan.next, + CHashPtrMark(scan.next))) + { + CHashTableIncrementStatistic(table, CHS_Delete_Retry); + goto retry; + } + + /* Deletion is done; attempt to remove node from list. */ + if (__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, + scan.next)) + CHashAddToGarbage(table, bucket, scan.target); + else + { + CHashScanResult cleanup_scan; + + /* + * If we weren't able to remove the deleted item, rescan + * the bucket to make sure it's really gone. This is just + * like a regular bucket scan, except that we don't care + * about the results. We're just doing it to achieve the + * side-effect of removing delete-marked nodes from the + * bucket chain. + */ + CHashTableIncrementStatistic(table, CHS_Cleanup_Scan); + CHashBucketScan(table, b, hashcode, entry, &cleanup_scan); + } + } + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + /* We're done. */ + CHashTableIncrementStatistic(table, CHS_Delete); + if (!scan.found) + CHashTableIncrementStatistic(table, CHS_Delete_Failed); + return scan.found; +} + +/* + * Provide backend-local statistics to caller. + */ +void +CHashStatistics(CHashTable table, uint64 *stats) +{ + memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics); +} + +/* + * Scan one bucket of a concurrent hash table, storing the results in a + * CHashResult object provided by the caller. + * + * Caller must suppress garbage collection for the target bucket before + * calling this function, and resume it afterwards. + * + * On return, res->found will be true if a matching item was found and false + * otherwise. res->target will be a CHashPtr referencing the matching item, + * or the first one following the position where the matching item should have + * been; res->pointer_to_target will be a pointer to the memory address from + * which res->target was read. + * + * If res->target is not invalid, then res->target_node is a pointer to its + * location in memory. If res->found, then res->next will be the next pointer + * of res->target_node; otherwise, it's undefined. + */ +static void +CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res) +{ + CHashPtr target; + CHashPtr *pointer_to_target; + CHashNode *target_node = NULL; + +retry: + pointer_to_target = start; + target = *pointer_to_target; + for (;;) + { + CHashPtr next; + uint32 h; + int cmp; + + /* + * If we've reached the end of the bucket chain, stop; otherwise, + * figure out the actual address of the next item. + */ + if (CHashPtrIsInvalid(target)) + { + res->found = false; + break; + } + target_node = CHashTableGetNode(table, target); + + /* + * target may have been fetched from an arena entry that could be + * concurrently modified, so a dependency barrier is required before + * dereferencing the derived pointer. + */ + pg_read_barrier_depends(); + next = target_node->next; + + /* + * For simplicity, any bucket scan, even if it's servicing a search, + * will attempt to remove delete-marked items from the bucket. This + * ensures that delete-marked elements are removed from bucket chains + * as quickly as possible and reduces code duplication. See + * CHashDelete for further comments about why delete-marking is + * necessary and how it allows safe deletion. + */ + if (CHashPtrIsMarked(next)) + { +zap: + if (__sync_bool_compare_and_swap(pointer_to_target, + target, + CHashPtrUnmark(next))) + { + /* + * No one else can possibly have modified target_node->next, + * because such modifications are not allowed after a + * delete-mark has been applied. Thus, if we just keep + * following the next pointers, we're guaranteed to visit + * all non-deleted items (and possibly some deleted items) + * that were present at the time we began the scan. + */ + CHashTableIncrementStatistic(table, CHS_Scan_Expunge); + CHashAddToGarbage(table, hashcode & table->bucket_mask, + target); + target = CHashPtrUnmark(next); + } + else + { + /* + * If the previous node has been delete-marked, we can't + * further update the next-pointer, so restart the scan. + * Otherwise, we know that some other backend removed one + * or more deleted nodes from the bucket chain just as we + * were trying to do, and we can simply continue the scan + * from wherever the previous node is pointing now. It's + * possible that some concurrent inserts have happened, too, + * but that's OK; we can view those as having happened "before" + * whatever operation this scan is supporting. + * + * Although starvation is a theoretical possibility here if + * we are forced to retry repeatedly, even a single retry is + * vanishingly unlikely in practice. It requires some other + * backend to delete both the node that we're looking at and + * the node which precedes it before we advance to the next + * node. That could certainly happen occasionally, but we'd + * have to be pretty unlucky to have it happen even twice in + * a row. + */ + CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail); + target = *pointer_to_target; + if (CHashPtrIsMarked(target)) + { + CHashTableIncrementStatistic(table, CHS_Scan_Restart); + goto retry; + } + } + continue; + } + + /* + * Bucket chains are kept in order, so that there is exactly one legal + * point at which any given key can be inserted. The ordering is by + * hashcode first, and then by memcmp ordering of the keys involved. + */ + h = target_node->un.hashcode; + if (h == hashcode) + cmp = memcmp(CHashNodeGetItem(target_node), key, + table->desc.key_size); + else if (h > hashcode) + cmp = 1; + else + cmp = -1; + + /* + * If cmp < 0, then we haven't yet reached the point at which we'd + * expect to find the key, so we must continue the scan. If cmp == 0, + * we've found the key and can stop. If cmp > 0, we've either passed + * the point where we expect to find the key OR someone delete-marked + * the item and overwrote the hashcode with a gcnext pointer. In the + * latter case we must take care not to be fooled into stopping the + * scan early. + */ + if (cmp >= 0) + { + if (cmp == 0) + { + res->found = true; + res->next = next; + break; + } + else + { + /* + * pg_read_barrier() prevents the reread of the next pointer + * from being speculated ahead of the read of the hash value. + */ + pg_read_barrier(); + next = target_node->next; + if (CHashPtrIsMarked(next)) + goto zap; + res->found = false; + break; + } + } + + /* Continue scan from next node. */ + pointer_to_target = &target_node->next; + target = next; + } + + /* Send results back to caller. */ + res->target = target; + res->pointer_to_target = pointer_to_target; + res->target_node = target_node; +} + +/* + * Allocate an arena slot for a new item to be inserted into a hash table. + * + * We don't want to wait until every single free-list is completely empty + * before beginning to garbage collect, because that could result in very + * fast allocation followed by a storm of garbage collection activity. + * It could also lead to every inserting backend ganging up on the only + * non-empty freelist. + * + * To avoid that, we check free lists and garbage lists in alternation. + * We always start with the same free list - which one is based on our + * backend ID - but we try to round-robin over all the available garbage + * lists. Whenever we successfully garbage collect, we put the recovered + * items on our own free list. In this way, if there's only one backend + * active, it will typically find a free buffer in the first place it looks: + * its own free list. It will also settle into a pattern of garbage + * collecting the garbage list which it has visited least recently, which + * is what we want. + */ +static CHashPtr +CHashAllocate(CHashTable table) +{ + uint32 f_current; + CHashPtr new; + + /* Pick a starting freelist base on our backend ID. */ + f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + + /* If this process hasn't initialized gc_next yet, do that now. */ + if (table->gc_pid != MyProcPid) + { + table->gc_pid = MyProcPid; + table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table); + } + + /* Loop until we allocate a buffer. */ + for (;;) + { + CHashPtr *b; + + /* + * Try to pop a buffer from a freelist using compare-and-swap. + * + * Note that this is only safe if it's impossible for the next pointer + * of the first element on the list to change between the time when + * we read it and the time we use CAS to pop it off the list. This + * means that we can't allow any element that is currently on this + * freelist to be allocated, marked as garbage, garbage collected, + * and returned back to this freelist before we finish the CAS. + * + * If we attempt to pop the free-list and fail, we retry immediately + * with the same free-list. This reduces the frequency with which + * we're obliged to update our hazard pointers, which is a material + * savings due to the associated memory barrier. + */ + b = CHashTableGetFreeList(table, f_current); + MyProc->hazard[0] = b; + pg_memory_barrier(); + new = *b; + while (!CHashPtrIsInvalid(new)) + { + CHashNode *n = CHashTableGetNode(table, new); + + /* + * n is computed from table->freelist[f_current], which could + * be modified by concurrent activity, so we need a dependency + * barrier here. + */ + pg_read_barrier_depends(); + if (__sync_bool_compare_and_swap(b, new, n->un.gcnext)) + return new; + CHashTableIncrementStatistic(table, CHS_Allocate_Fail); + new = *b; + } + + /* Attempt garbage collection. */ + new = CHashAllocateViaGC(table); + if (!CHashPtrIsInvalid(new)) + return new; + + /* Advance to next freelist. */ + f_current = (f_current + 1) % CHashTableNFreeLists(table); + } +} + +/* + * Attempt to satisfy an allocation request via garbage collection. + */ +static CHashPtr +CHashAllocateViaGC(CHashTable table) +{ + uint32 f_home; + CHashPtr *b; + CHashPtr *fh; + CHashPtr fhead; + CHashPtr garbage; + CHashPtr new; + CHashNode *n; + uint32 i; + + /* Pick a target freelist based on our backend ID. */ + f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + fh = CHashTableGetFreeList(table, f_home); + + /* Select target garbage list. */ + table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table); + b = CHashTableGetGarbageList(table, table->gc_next); + garbage = *b; + + /* If list is empty, fail. */ + if (CHashPtrIsInvalid(garbage)) + return InvalidCHashPtr; + + /* If we're unable to empty the list via compare-and-swap, fail. */ + if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr)) + { + CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail); + return InvalidCHashPtr; + } + + /* + * Before removing elements removed from the garbage list to the + * freelist, we must wait until (1) all bucket scans that might + * still see elements on the freelist as part of the bucket chain + * have completed and (2) all allocations that might see an old + * version of the freelist containing one of the elements to be + * garbage collected have completed. + * + * Note: We can't begin this operation until the clearing of the + * garbage list has been committed to memory, but since that was + * done using an atomic operation no explicit barrier is needed + * here. + * + * Note: We could have a "soft" version of this that merely + * requeues the garbage if it's not immediately recycleable, but + * it's not clear that we need such a thing. On the flip side we + * might want to eventually enter a longer sleep here, or PANIC, + * but it's not clear exactly how to calibrate that. + */ + CHashTableIncrementStatistic(table, CHS_GC); + MyProc->hazard[0] = NULL; + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + volatile PGPROC *proc = &ProcGlobal->allProcs[i]; + void *hazard; + + hazard = proc->hazard[0]; + if (hazard == b || hazard == fh) + { + CHashTableIncrementStatistic(table, CHS_GC_Spin); + do + { + hazard = proc->hazard[0]; + } while (hazard == b || hazard == fh); + } + } + + /* Remove one item from list to satisfy current allocation. */ + new = garbage; + n = CHashTableGetNode(table, new); + pg_read_barrier_depends(); + fhead = n->un.gcnext; + + if (CHashPtrIsInvalid(fhead)) + { + /* + * We have reclaimed exactly node, so there's nothing to put + * back on the free list. In this case (only) we need a + * memory barrier, because the reads above must complete + * before we overwrite n->un.gcnext with a new hashcode. + * (This is only needed when we reclaim exactly one node, + * because in any other case we'll do a compare-and-swap + * before returning, which implies a full barrier.) + */ + pg_memory_barrier(); + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped); + } + else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead)) + { + /* + * Our free list is empty, and we've succesfully pushed the + * reclaimed nodes onto it. So we're done. + */ + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast); + } + else + { + CHashPtr fcurrent; + CHashPtr fnext; + CHashPtr oldhead; + + /* Walk list of reclaimed elements to end. */ + fcurrent = fhead; + for (;;) + { + n = CHashTableGetNode(table, fcurrent); + fnext = n->un.gcnext; + if (CHashPtrIsInvalid(fnext)) + break; + fcurrent = fnext; + } + + /* Push reclaimed elements onto home free list. */ + for (;;) + { + oldhead = *fh; + n->un.gcnext = oldhead; + if (__sync_bool_compare_and_swap(fh, oldhead, fhead)) + break; + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry); + } + } + + /* Return the element we saved for ourselves. */ + return new; +} + +/* + * Add an arena slot to the appropriate garbage list. + * + * The next garbage collection cycle for the affected bucket will move it + * to the free list. We can't do that immediately because there might be + * someone traversing the list who is counting on being able to follow the + * next pointer. It's OK to clobber the hash value, though, since a spurious + * failure to match an already-deleted item shouldn't cause any problems; + * this is why gcnext can share space with the hash value. + */ +static void +CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c) +{ + CHashPtr g; + CHashNode *n; + CHashPtr *garbage; + + n = CHashTableGetNode(table, c); + garbage = CHashTableGetGarbageByBucket(table, bucket); + + while (1) + { + g = *garbage; + n->un.gcnext = g; + if (__sync_bool_compare_and_swap(garbage, g, c)) + break; + CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry); + } +} diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c index e6f69e4edd..7374cc8eb4 100644 --- a/src/bin/pg_basebackup/pg_receivexlog.c +++ b/src/bin/pg_basebackup/pg_receivexlog.c @@ -66,9 +66,12 @@ usage(void) printf(_(" %s [OPTION]...\n"), progname); printf(_("\nOptions:\n")); printf(_(" -D, --directory=DIR receive transaction log files into this directory\n")); + printf(_(" -F --fsync-interval=SECS\n" + " time between fsyncs to transaction log files (default: %d)\n"), (fsync_interval / 1000)); printf(_(" -n, --no-loop do not loop on connection lost\n")); - printf(_(" -F --fsync-interval=INTERVAL\n" - " frequency of syncs to transaction log files (in seconds)\n")); + printf(_(" -s, --status-interval=SECS\n" + " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000)); + printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); printf(_(" -v, --verbose output verbose messages\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -76,12 +79,9 @@ usage(void) printf(_(" -d, --dbname=CONNSTR connection string\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); printf(_(" -p, --port=PORT database server port number\n")); - printf(_(" -s, --status-interval=INTERVAL\n" - " time between status packets sent to server (in seconds)\n")); printf(_(" -U, --username=NAME connect as specified database user\n")); printf(_(" -w, --no-password never prompt for password\n")); printf(_(" -W, --password force password prompt (should happen automatically)\n")); - printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); printf(_("\nOptional actions:\n")); printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n")); printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n")); diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c index 1a01167912..0d97638851 100644 --- a/src/bin/pg_basebackup/pg_recvlogical.c +++ b/src/bin/pg_basebackup/pg_recvlogical.c @@ -62,15 +62,27 @@ static void disconnect_and_exit(int code); static void usage(void) { - printf(_("%s receives PostgreSQL logical change stream.\n\n"), + printf(_("%s receives PostgreSQL logical change streams.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]...\n"), progname); + printf(_("\nAction to be performed:\n")); + printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n")); + printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n")); + printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n")); printf(_("\nOptions:\n")); - printf(_(" -f, --file=FILE receive log into this file. - for stdout\n")); + printf(_(" -f, --file=FILE receive log into this file, - for stdout\n")); printf(_(" -F --fsync-interval=SECS\n" - " frequency of syncs to the output file (default: %d)\n"), (fsync_interval / 1000)); + " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000)); + printf(_(" -I, --startpos=LSN where in an existing slot should the streaming start\n")); printf(_(" -n, --no-loop do not loop on connection lost\n")); + printf(_(" -o, --option=NAME[=VALUE]\n" + " pass option NAME with optional value VALUE to the\n" + " output plugin\n")); + printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin); + printf(_(" -s, --status-interval=SECS\n" + " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000)); + printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n")); printf(_(" -v, --verbose output verbose messages\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -81,19 +93,6 @@ usage(void) printf(_(" -U, --username=NAME connect as specified database user\n")); printf(_(" -w, --no-password never prompt for password\n")); printf(_(" -W, --password force password prompt (should happen automatically)\n")); - printf(_("\nReplication options:\n")); - printf(_(" -I, --startpos=PTR where in an existing slot should the streaming start\n")); - printf(_(" -o, --option=NAME[=VALUE]\n" - " specify option NAME with optional value VALUE, to be passed\n" - " to the output plugin\n")); - printf(_(" -P, --plugin=PLUGIN use output plugin PLUGIN (default: %s)\n"), plugin); - printf(_(" -s, --status-interval=SECS\n" - " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000)); - printf(_(" -S, --slot=SLOT name of the logical replication slot\n")); - printf(_("\nAction to be performed:\n")); - printf(_(" --create-slot create a new replication slot (for the slot's name see --slot)\n")); - printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n")); - printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n")); printf(_("\nReport bugs to <pgsql-bugs@postgresql.org>.\n")); } diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index a46ca53ba6..733f1cbc86 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -1456,7 +1456,9 @@ pgwin32_doRegister(void) NULL, NULL, "RPCSS\0", register_username, register_password)) == NULL) { CloseServiceHandle(hSCM); - write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError()); + write_stderr(_("%s: could not register service \"%s\": error code %lu\n"), + progname, register_servicename, + (unsigned long) GetLastError()); exit(1); } CloseServiceHandle(hService); @@ -1484,14 +1486,18 @@ pgwin32_doUnregister(void) if ((hService = OpenService(hSCM, register_servicename, DELETE)) == NULL) { CloseServiceHandle(hSCM); - write_stderr(_("%s: could not open service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError()); + write_stderr(_("%s: could not open service \"%s\": error code %lu\n"), + progname, register_servicename, + (unsigned long) GetLastError()); exit(1); } if (!DeleteService(hService)) { CloseServiceHandle(hService); CloseServiceHandle(hSCM); - write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError()); + write_stderr(_("%s: could not unregister service \"%s\": error code %lu\n"), + progname, register_servicename, + (unsigned long) GetLastError()); exit(1); } CloseServiceHandle(hService); @@ -1627,7 +1633,9 @@ pgwin32_doRunAsService(void) if (StartServiceCtrlDispatcher(st) == 0) { - write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), progname, register_servicename, GetLastError()); + write_stderr(_("%s: could not start service \"%s\": error code %lu\n"), + progname, register_servicename, + (unsigned long) GetLastError()); exit(1); } } @@ -1708,7 +1716,14 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser /* Open the current token to use as a base for the restricted one */ if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &origToken)) { - write_stderr(_("%s: could not open process token: error code %lu\n"), progname, GetLastError()); + /* + * Most Windows targets make DWORD a 32-bit unsigned long. Cygwin + * x86_64, an LP64 target, makes it a 32-bit unsigned int. In code + * built for Cygwin as well as for native Windows targets, cast DWORD + * before printing. + */ + write_stderr(_("%s: could not open process token: error code %lu\n"), + progname, (unsigned long) GetLastError()); return 0; } @@ -1721,7 +1736,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_POWER_USERS, 0, 0, 0, 0, 0, 0, &dropSids[1].Sid)) { - write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), progname, GetLastError()); + write_stderr(_("%s: could not allocate SIDs: error code %lu\n"), + progname, (unsigned long) GetLastError()); return 0; } @@ -1740,7 +1756,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser if (!b) { - write_stderr(_("%s: could not create restricted token: error code %lu\n"), progname, GetLastError()); + write_stderr(_("%s: could not create restricted token: error code %lu\n"), + progname, (unsigned long) GetLastError()); return 0; } @@ -1791,7 +1808,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser HANDLE job; char jobname[128]; - sprintf(jobname, "PostgreSQL_%lu", processInfo->dwProcessId); + sprintf(jobname, "PostgreSQL_%lu", + (unsigned long) processInfo->dwProcessId); job = _CreateJobObject(NULL, jobname); if (job) diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index c9e61dfa39..0e1e0cd5f0 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -127,6 +127,10 @@ typedef struct HashJoinTableData int nbuckets; /* # buckets in the in-memory hash table */ int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */ + int nbuckets_original; /* # buckets when starting the first hash */ + int nbuckets_optimal; /* optimal # buckets (per batch) */ + int log2_nbuckets_optimal; /* same as log2_nbuckets optimal */ + /* buckets[i] is head of list of tuples in i'th in-memory bucket */ struct HashJoinTupleData **buckets; /* buckets array is per-batch storage, as are all the tuples */ @@ -148,6 +152,7 @@ typedef struct HashJoinTableData bool growEnabled; /* flag to shut off nbatch increases */ double totalTuples; /* # tuples obtained from inner plan */ + double skewTuples; /* # tuples inserted into skew tuples */ /* * These arrays are allocated for the life of the hash join, but only if diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h index b36705b862..6ef779bf95 100644 --- a/src/include/storage/barrier.h +++ b/src/include/storage/barrier.h @@ -20,4 +20,12 @@ */ #include "port/atomics.h" +/* + * If dependency barriers are undefined, we define them as a no-op. The only + * known platform where anything more is required is DEC Alpha. + */ +#if !defined(pg_read_barrier_depends) +#define pg_read_barrier_depends() +#endif + #endif /* BARRIER_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 0e69b633c3..4c6fac8052 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -96,20 +96,6 @@ typedef struct buftag ) /* - * The shared buffer mapping table is partitioned to reduce contention. - * To determine which partition lock a given tag requires, compute the tag's - * hash code with BufTableHashCode(), then apply BufMappingPartitionLock(). - * NB: NUM_BUFFER_PARTITIONS must be a power of 2! - */ -#define BufTableHashPartition(hashcode) \ - ((hashcode) % NUM_BUFFER_PARTITIONS) -#define BufMappingPartitionLock(hashcode) \ - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \ - BufTableHashPartition(hashcode)].lock) -#define BufMappingPartitionLockByIndex(i) \ - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock) - -/* * BufferDesc -- shared descriptor/state data for a single shared buffer. * * Note: buf_hdr_lock must be held to examine or change the tag, flags, @@ -200,9 +186,9 @@ extern void StrategyInitialize(bool init); extern Size BufTableShmemSize(int size); extern void InitBufTable(int size); extern uint32 BufTableHashCode(BufferTag *tagPtr); -extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); -extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); -extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); +extern int BufTableLookup(BufferTag *tagPtr); +extern int BufTableInsert(BufferTag *tagPtr, int buf_id); +extern void BufTableDelete(BufferTag *tagPtr); /* localbuf.c */ extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 595e69da48..8e98425ca4 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -140,7 +140,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray; */ /* Number of partitions of the shared buffer mapping hashtable */ -#define NUM_BUFFER_PARTITIONS 128 +#define NUM_BUFFER_PARTITIONS 0 /* Number of partitions the shared lock tables are divided into */ #define LOG2_NUM_LOCK_PARTITIONS 4 diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 38758d3ea5..cdf2f268fd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -59,6 +59,14 @@ struct XidCache #define FP_LOCK_SLOTS_PER_BACKEND 16 /* + * Some lock-free algorithms require each backend process to be able to + * advertise a certain number of "hazard pointers" in shared memory. + * Right now one per backend is enough for our purpose, but some + * algorithms require more. + */ +#define NUM_HAZARD_POINTERS 1 + +/* * Each backend has a PGPROC struct in shared memory. There is also a list of * currently-unused PGPROC structs that will be reallocated to new backends. * @@ -143,6 +151,12 @@ struct PGPROC bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID * lock */ + + /* + * Hazard pointers. Currently one is enough, though some algorithms + * require a few more. + */ + void *hazard[NUM_HAZARD_POINTERS]; }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h index 5bae3807af..063400ae28 100644 --- a/src/include/storage/shm_mq.h +++ b/src/include/storage/shm_mq.h @@ -25,6 +25,13 @@ typedef struct shm_mq shm_mq; struct shm_mq_handle; typedef struct shm_mq_handle shm_mq_handle; +/* Descriptors for a single write spanning multiple locations. */ +typedef struct +{ + const char *data; + Size len; +} shm_mq_iovec; + /* Possible results of a send or receive operation. */ typedef enum { @@ -52,12 +59,17 @@ extern PGPROC *shm_mq_get_sender(shm_mq *); extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle); +/* Associate worker handle with shm_mq. */ +extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *); + /* Break connection. */ extern void shm_mq_detach(shm_mq *); /* Send or receive messages. */ extern shm_mq_result shm_mq_send(shm_mq_handle *mqh, - Size nbytes, void *data, bool nowait); + Size nbytes, const void *data, bool nowait); +extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh, + shm_mq_iovec *iov, int iovcnt, bool nowait); extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait); diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 745eb7e576..4ff8415fac 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -40,6 +40,7 @@ extern void InitShmemIndex(void); extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size, HASHCTL *infoP, int hash_flags); extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); +extern void *ShmemAttachStruct(const char *name); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index d88e7a3b26..fb1b4a42dd 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -676,13 +676,10 @@ extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS); extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS); extern Datum pg_get_indexdef(PG_FUNCTION_ARGS); extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS); -extern char *pg_get_indexdef_string(Oid indexrelid); -extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty); extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS); extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS); extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS); extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS); -extern char *pg_get_constraintdef_string(Oid constraintId); extern Datum pg_get_expr(PG_FUNCTION_ARGS); extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS); extern Datum pg_get_userbyid(PG_FUNCTION_ARGS); @@ -692,17 +689,9 @@ extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS); extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS); extern Datum pg_get_function_result(PG_FUNCTION_ARGS); extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS); -extern char *deparse_expression(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit); -extern List *deparse_context_for(const char *aliasname, Oid relid); -extern List *deparse_context_for_planstate(Node *planstate, List *ancestors, - List *rtable, List *rtable_names); -extern List *select_rtable_names_for_explain(List *rtable, - Bitmapset *rels_used); extern const char *quote_identifier(const char *ident); extern char *quote_qualified_identifier(const char *qualifier, const char *ident); -extern char *generate_collation_name(Oid collid); /* tid.c */ diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h new file mode 100644 index 0000000000..ee0573c9c7 --- /dev/null +++ b/src/include/utils/chash.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * chash.h + * Concurrent shared-memory hash table. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/chash.h + * + *------------------------------------------------------------------------- + */ +#ifndef CHASH_H +#define CHASH_H + +/* Everything caller must supply to set up a concurrent hash table. */ +typedef struct +{ + const char *shmem_name; /* shared memory name for this hash table */ + uint32 capacity; /* maximum size of hash table */ + uint16 element_size; /* size of each element */ + uint16 key_size; /* size of each key */ +} CHashDescriptor; + +/* Concurrent hash table statistics. */ +typedef enum +{ + CHS_Search, /* search */ + CHS_Search_Failed, /* search failed (no such key) */ + CHS_Insert, /* insert */ + CHS_Insert_Failed, /* insert failed (duplicate key) */ + CHS_Insert_Retry, /* insert retried (concurrent update) */ + CHS_Delete, /* delete */ + CHS_Delete_Failed, /* delete failed (no such key) */ + CHS_Delete_Retry, /* delete retried (concurrent update) */ + CHS_Scan_Expunge, /* scan expunged deleted item */ + CHS_Scan_Expunge_Fail, /* scan failed to expunge */ + CHS_Scan_Restart, /* concurrent deletes forced a scan restart */ + CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */ + CHS_Allocate_Fail, /* allocation failed to pop freelist */ + CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */ + CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */ + CHS_GC, /* garbage collection cycle */ + CHS_GC_Spin, /* GC spun waiting for concurrent process */ + CHS_GC_Reclaim_Skipped, /* GC recovered only one item */ + CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */ + CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */ + CHS_NumberOfStatistics /* number of statistics */ +} CHashStatisticsType; + +/* Human-readable names for statistics. */ +extern char *CHashStatisticsNames[]; + +/* Opaque handle for a concurrent hash table. */ +struct CHashTableData; +typedef struct CHashTableData *CHashTable; + +/* Initialization functions. */ +extern CHashTable CHashBootstrap(CHashDescriptor *desc); +extern Size CHashEstimateSize(CHashTable table); +extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc); + +/* Accessor functions. */ +extern bool CHashInsert(CHashTable table, void *entry); +extern bool CHashDelete(CHashTable table, void *key); +extern bool CHashSearch(CHashTable table, void *entry); +extern void CHashStatistics(CHashTable table, uint64 *stats); + +#endif /* CHASH_H */ diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h new file mode 100644 index 0000000000..520b06653c --- /dev/null +++ b/src/include/utils/ruleutils.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * ruleutils.h + * Declarations for ruleutils.c + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/ruleutils.h + * + *------------------------------------------------------------------------- + */ +#ifndef RULEUTILS_H +#define RULEUTILS_H + +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "nodes/pg_list.h" + + +extern char *pg_get_indexdef_string(Oid indexrelid); +extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty); + +extern char *pg_get_constraintdef_string(Oid constraintId); +extern char *deparse_expression(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit); +extern List *deparse_context_for(const char *aliasname, Oid relid); +extern List *deparse_context_for_planstate(Node *planstate, List *ancestors, + List *rtable, List *rtable_names); +extern List *select_rtable_names_for_explain(List *rtable, + Bitmapset *rels_used); +extern char *generate_collation_name(Oid collid); + +#endif /* RULEUTILS_H */ diff --git a/src/port/crypt.c b/src/port/crypt.c index ef8bf46338..6a902ef0fc 100644 --- a/src/port/crypt.c +++ b/src/port/crypt.c @@ -87,7 +87,7 @@ static int des_cipher(const char *in, char *out, long salt, int num_iter); * define "B64" to be the declaration for a 64 bit integer. * XXX this feature is currently unused, see "endian" comment below. */ -#define B64 __int64 +/* #define B64 int64 */ /* * define "LARGEDATA" to get faster permutations, by using about 72 kilobytes diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index eb37da7168..9146f59435 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}'; f (1 row) +SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb; + ?column? +---------- + t +(1 row) + SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}'); jsonb_contained ----------------- diff --git a/src/test/regress/expected/jsonb_1.out b/src/test/regress/expected/jsonb_1.out index f3bfc7bcf5..83d61f8c7e 100644 --- a/src/test/regress/expected/jsonb_1.out +++ b/src/test/regress/expected/jsonb_1.out @@ -707,6 +707,42 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}'; f (1 row) +SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb; + ?column? +---------- + t +(1 row) + +SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb; + ?column? +---------- + t +(1 row) + SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}'); jsonb_contained ----------------- diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 1076b76210..b04510c599 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -411,7 +411,7 @@ REFRESH MATERIALIZED VIEW mv; ERROR: could not create unique index "mv_a_idx" DETAIL: Key (a)=(1) is duplicated. REFRESH MATERIALIZED VIEW CONCURRENTLY mv; -ERROR: new data for "mv" contains duplicate rows without any NULL columns +ERROR: new data for "mv" contains duplicate rows without any null columns DETAIL: Row: (1,10) DROP TABLE foo CASCADE; NOTICE: drop cascades to materialized view mv diff --git a/src/test/regress/expected/polygon.out b/src/test/regress/expected/polygon.out index b252902720..33388eb909 100644 --- a/src/test/regress/expected/polygon.out +++ b/src/test/regress/expected/polygon.out @@ -3,15 +3,15 @@ -- -- polygon logic -- --- 3 o --- | --- 2 + | --- / | --- 1 # o + --- / | --- 0 #-----o-+ +-- 3 o +-- | +-- 2 + | +-- / | +-- 1 # + +-- / o | +-- 0 #-----o-+ -- --- 0 1 2 3 4 +-- 0 1 2 3 4 -- CREATE TABLE POLYGON_TBL(f1 polygon); INSERT INTO POLYGON_TBL(f1) VALUES ('(2.0,0.0),(2.0,4.0),(0.0,0.0)'); @@ -128,15 +128,16 @@ SELECT '' AS one, p.* -- -- polygon logic -- --- 3 o --- | --- 2 + | --- / | --- 1 / o + +-- 3 o +-- /| +-- 2 + | +-- / | +-- 1 / o + -- / | --- 0 +-----o-+ +-- 0 +-----o-+ +-- +-- 0 1 2 3 4 -- --- 0 1 2 3 4 -- -- left of SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false; @@ -248,11 +249,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),( (1 row) -- +--------------------+ --- | *---* 1 +-- | *---* 1 -- | + | | -- | 2 *---* -- +--------------------+ --- 3 +-- 3 -- Edges 1-2, 2-3 are not shown on picture SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true"; true diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index ed266d5c88..f1ed021be2 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -156,6 +156,13 @@ SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"c"}'; SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b"}'; SELECT '{"a":"b", "b":1, "c":null}'::jsonb @> '{"a":"b", "c":"q"}'; +SELECT '[1,2]'::jsonb @> '[1,2,2]'::jsonb; +SELECT '[1,1,2]'::jsonb @> '[1,2,2]'::jsonb; +SELECT '[[1,2]]'::jsonb @> '[[1,2,2]]'::jsonb; +SELECT '[1,2,2]'::jsonb <@ '[1,2]'::jsonb; +SELECT '[1,2,2]'::jsonb <@ '[1,1,2]'::jsonb; +SELECT '[[1,2,2]]'::jsonb <@ '[[1,2]]'::jsonb; + SELECT jsonb_contained('{"a":"b"}', '{"a":"b", "b":1, "c":null}'); SELECT jsonb_contained('{"a":"b", "c":null}', '{"a":"b", "b":1, "c":null}'); SELECT jsonb_contained('{"a":"b", "g":null}', '{"a":"b", "b":1, "c":null}'); diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql index 2dad566f37..d95fa96447 100644 --- a/src/test/regress/sql/polygon.sql +++ b/src/test/regress/sql/polygon.sql @@ -3,15 +3,15 @@ -- -- polygon logic -- --- 3 o --- | --- 2 + | --- / | --- 1 # o + --- / | --- 0 #-----o-+ +-- 3 o +-- | +-- 2 + | +-- / | +-- 1 # + +-- / o | +-- 0 #-----o-+ -- --- 0 1 2 3 4 +-- 0 1 2 3 4 -- CREATE TABLE POLYGON_TBL(f1 polygon); @@ -83,15 +83,16 @@ SELECT '' AS one, p.* -- -- polygon logic -- --- 3 o --- | --- 2 + | --- / | --- 1 / o + +-- 3 o +-- /| +-- 2 + | +-- / | +-- 1 / o + -- / | --- 0 +-----o-+ +-- 0 +-----o-+ +-- +-- 0 1 2 3 4 -- --- 0 1 2 3 4 -- -- left of SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' << polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS false; @@ -155,11 +156,11 @@ SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' ~= polygon '(3.0,1.0),(3.0,3.0),( SELECT polygon '(2.0,0.0),(2.0,4.0),(0.0,0.0)' && polygon '(3.0,1.0),(3.0,3.0),(1.0,0.0)' AS true; -- +--------------------+ --- | *---* 1 +-- | *---* 1 -- | + | | -- | 2 *---* -- +--------------------+ --- 3 +-- 3 -- Edges 1-2, 2-3 are not shown on picture SELECT '((0,4),(6,4),(1,2),(6,0),(0,0))'::polygon && '((2,1),(2,3),(3,3),(3,1))'::polygon AS "true"; |