diff options
author | Alvaro Herrera | 2013-01-23 15:04:59 +0000 |
---|---|---|
committer | Alvaro Herrera | 2013-01-23 15:04:59 +0000 |
commit | 0ac5ad5134f2769ccbaefec73844f8504c4d6182 (patch) | |
tree | d9b0ba4a1b65a52030820efe68a9c937c46aad1f /src/backend/commands | |
parent | f925c79b9f36c54b67053ade5ad225a75b8dc803 (diff) |
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This also causes additional WAL-logging (there was previously a single
WAL record for a locked tuple; now there are as many records as there
are updated copies of the tuple).
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, this patch fixes the old behavior in which a subtransaction
that grabbed a stronger tuple lock than its parent (sub)transaction held
on a given tuple, and later aborted, caused the weaker lock to be lost.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time on it. The original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important ones start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
Diffstat (limited to 'src/backend/commands')
-rw-r--r-- | src/backend/commands/analyze.c | 9 | ||||
-rw-r--r-- | src/backend/commands/cluster.c | 37 | ||||
-rw-r--r-- | src/backend/commands/dbcommands.c | 15 | ||||
-rw-r--r-- | src/backend/commands/sequence.c | 10 | ||||
-rw-r--r-- | src/backend/commands/tablecmds.c | 12 | ||||
-rw-r--r-- | src/backend/commands/trigger.c | 32 | ||||
-rw-r--r-- | src/backend/commands/vacuum.c | 96 | ||||
-rw-r--r-- | src/backend/commands/vacuumlazy.c | 24 |
8 files changed, 180 insertions, 55 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 7a5eb42424b..d7b17a5aba6 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -16,6 +16,7 @@ #include <math.h> +#include "access/multixact.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -580,7 +581,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, totalrows, visibilitymap_count(onerel), hasindex, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); /* * Same for indexes. Vacuum always scans all indexes, so if we're part of @@ -600,7 +602,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, totalindexrows, 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); } } @@ -1193,7 +1196,7 @@ acquire_sample_rows(Relation onerel, int elevel, * right. (Note: this works out properly when the row was * both inserted and deleted in our xact.) */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targtuple.t_data))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple.t_data))) deadrows += 1; else liverows += 1; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 238781b6a70..c0cb2f66545 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -17,6 +17,7 @@ */ #include "postgres.h" +#include "access/multixact.h" #include "access/relscan.h" #include "access/rewriteheap.h" #include "access/transam.h" @@ -65,7 +66,8 @@ static void rebuild_relation(Relation OldHeap, Oid indexOid, int freeze_min_age, int freeze_table_age, bool verbose); static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool verbose, - bool *pSwapToastByContent, TransactionId *pFreezeXid); + bool *pSwapToastByContent, TransactionId *pFreezeXid, + MultiXactId *pFreezeMulti); static List *get_tables_to_cluster(MemoryContext 
cluster_context); static void reform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc, @@ -549,6 +551,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool is_system_catalog; bool swap_toast_by_content; TransactionId frozenXid; + MultiXactId frozenMulti; /* Mark the correct index as clustered */ if (OidIsValid(indexOid)) @@ -566,14 +569,14 @@ rebuild_relation(Relation OldHeap, Oid indexOid, /* Copy the heap data into the new table in the desired order */ copy_heap_data(OIDNewHeap, tableOid, indexOid, freeze_min_age, freeze_table_age, verbose, - &swap_toast_by_content, &frozenXid); + &swap_toast_by_content, &frozenXid, &frozenMulti); /* * Swap the physical files of the target and transient tables, then * rebuild the target's indexes and throw away the transient table. */ finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, - swap_toast_by_content, false, frozenXid); + swap_toast_by_content, false, frozenXid, frozenMulti); } @@ -706,7 +709,8 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool verbose, - bool *pSwapToastByContent, TransactionId *pFreezeXid) + bool *pSwapToastByContent, TransactionId *pFreezeXid, + MultiXactId *pFreezeMulti) { Relation NewHeap, OldHeap, @@ -722,6 +726,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool is_system_catalog; TransactionId OldestXmin; TransactionId FreezeXid; + MultiXactId MultiXactFrzLimit; RewriteState rwstate; bool use_sort; Tuplesortstate *tuplesort; @@ -822,7 +827,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, */ vacuum_set_xid_limits(freeze_min_age, freeze_table_age, OldHeap->rd_rel->relisshared, - &OldestXmin, &FreezeXid, NULL); + &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go @@ -831,14 +836,16 @@ copy_heap_data(Oid OIDNewHeap, Oid 
OIDOldHeap, Oid OIDOldIndex, if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid)) FreezeXid = OldHeap->rd_rel->relfrozenxid; - /* return selected value to caller */ + /* return selected values to caller */ *pFreezeXid = FreezeXid; + *pFreezeMulti = MultiXactFrzLimit; /* Remember if it's a system catalog */ is_system_catalog = IsSystemRelation(OldHeap); /* Initialize the rewrite operation */ - rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal); + rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, + MultiXactFrzLimit, use_wal); /* * Decide whether to use an indexscan or seqscan-and-optional-sort to scan @@ -966,9 +973,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, /* * Similar situation to INSERT_IN_PROGRESS case. */ - Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple->t_data))) + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) elog(WARNING, "concurrent delete in progress within table \"%s\"", RelationGetRelationName(OldHeap)); /* treat as recently dead */ @@ -1097,6 +1103,7 @@ static void swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, bool swap_toast_by_content, TransactionId frozenXid, + MultiXactId frozenMulti, Oid *mapped_tables) { Relation relRelation; @@ -1204,11 +1211,13 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, * and then fail to commit the pg_class update. 
*/ - /* set rel1's frozen Xid */ + /* set rel1's frozen Xid and minimum MultiXid */ if (relform1->relkind != RELKIND_INDEX) { Assert(TransactionIdIsNormal(frozenXid)); relform1->relfrozenxid = frozenXid; + Assert(MultiXactIdIsValid(frozenMulti)); + relform1->relminmxid = frozenMulti; } /* swap size statistics too, since new rel has freshly-updated stats */ @@ -1272,6 +1281,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, target_is_pg_class, swap_toast_by_content, frozenXid, + frozenMulti, mapped_tables); } else @@ -1361,6 +1371,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, target_is_pg_class, swap_toast_by_content, InvalidTransactionId, + InvalidMultiXactId, mapped_tables); /* Clean up. */ @@ -1398,7 +1409,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, bool check_constraints, - TransactionId frozenXid) + TransactionId frozenXid, + MultiXactId frozenMulti) { ObjectAddress object; Oid mapped_tables[4]; @@ -1414,7 +1426,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, */ swap_relation_files(OIDOldHeap, OIDNewHeap, (OIDOldHeap == RelationRelationId), - swap_toast_by_content, frozenXid, mapped_tables); + swap_toast_by_content, frozenXid, frozenMulti, + mapped_tables); /* * If it's a system catalog, queue an sinval message to flush all diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 4ad4b997585..5b06af24a6c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -80,6 +80,7 @@ static bool get_db_info(const char *name, LOCKMODE lockmode, Oid *dbIdP, Oid *ownerIdP, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, + MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype); static bool have_createdb_privilege(void); static void remove_dbtablespaces(Oid db_id); @@ -104,6 +105,7 @@ createdb(const CreatedbStmt *stmt) bool 
src_allowconn; Oid src_lastsysoid; TransactionId src_frozenxid; + MultiXactId src_minmxid; Oid src_deftablespace; volatile Oid dst_deftablespace; Relation pg_database_rel; @@ -288,7 +290,7 @@ createdb(const CreatedbStmt *stmt) if (!get_db_info(dbtemplate, ShareLock, &src_dboid, &src_owner, &src_encoding, &src_istemplate, &src_allowconn, &src_lastsysoid, - &src_frozenxid, &src_deftablespace, + &src_frozenxid, &src_minmxid, &src_deftablespace, &src_collate, &src_ctype)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), @@ -491,6 +493,7 @@ createdb(const CreatedbStmt *stmt) new_record[Anum_pg_database_datconnlimit - 1] = Int32GetDatum(dbconnlimit); new_record[Anum_pg_database_datlastsysoid - 1] = ObjectIdGetDatum(src_lastsysoid); new_record[Anum_pg_database_datfrozenxid - 1] = TransactionIdGetDatum(src_frozenxid); + new_record[Anum_pg_database_datminmxid - 1] = TransactionIdGetDatum(src_minmxid); new_record[Anum_pg_database_dattablespace - 1] = ObjectIdGetDatum(dst_deftablespace); /* @@ -786,7 +789,7 @@ dropdb(const char *dbname, bool missing_ok) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL)) + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { if (!missing_ok) { @@ -945,7 +948,7 @@ RenameDatabase(const char *oldname, const char *newname) rel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", oldname))); @@ -1046,7 +1049,7 @@ movedb(const char *dbname, const char *tblspcname) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) + NULL, NULL, NULL, 
NULL, NULL, &src_tblspcoid, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); @@ -1599,6 +1602,7 @@ get_db_info(const char *name, LOCKMODE lockmode, Oid *dbIdP, Oid *ownerIdP, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, + MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype) { bool result = false; @@ -1685,6 +1689,9 @@ get_db_info(const char *name, LOCKMODE lockmode, /* limit of frozen XIDs */ if (dbFrozenXidP) *dbFrozenXidP = dbform->datfrozenxid; + /* limit of frozen Multixacts */ + if (dbMinMultiP) + *dbMinMultiP = dbform->datminmxid; /* default tablespace for this database */ if (dbTablespace) *dbTablespace = dbform->dattablespace; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 1f2546d69ca..de41c8a1c71 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -14,8 +14,9 @@ */ #include "postgres.h" -#include "access/transam.h" #include "access/htup_details.h" +#include "access/multixact.h" +#include "access/transam.h" #include "access/xlogutils.h" #include "catalog/dependency.h" #include "catalog/namespace.h" @@ -282,8 +283,10 @@ ResetSequence(Oid seq_relid) /* * Create a new storage file for the sequence. We want to keep the * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs. + * Same with relminmxid, since a sequence will never contain multixacts. */ - RelationSetNewRelfilenode(seq_rel, InvalidTransactionId); + RelationSetNewRelfilenode(seq_rel, InvalidTransactionId, + InvalidMultiXactId); /* * Insert the modified tuple into the new storage file. @@ -1110,7 +1113,8 @@ read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple) * bit update, ie, don't bother to WAL-log it, since we can certainly do * this again if the update gets lost. 
*/ - if (HeapTupleHeaderGetXmax(seqtuple->t_data) != InvalidTransactionId) + Assert(!(seqtuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); + if (HeapTupleHeaderGetRawXmax(seqtuple->t_data) != InvalidTransactionId) { HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId); seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cad83117f95..6bc056bbc33 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -15,7 +15,9 @@ #include "postgres.h" #include "access/genam.h" +#include "access/heapam.h" #include "access/heapam_xlog.h" +#include "access/multixact.h" #include "access/reloptions.h" #include "access/relscan.h" #include "access/sysattr.h" @@ -1130,6 +1132,7 @@ ExecuteTruncate(TruncateStmt *stmt) { Oid heap_relid; Oid toast_relid; + MultiXactId minmulti; /* * This effectively deletes all rows in the table, and may be done @@ -1139,6 +1142,8 @@ ExecuteTruncate(TruncateStmt *stmt) */ CheckTableForSerializableConflictIn(rel); + minmulti = GetOldestMultiXactId(); + /* * Need the full transaction-safe pushups. * @@ -1146,7 +1151,7 @@ ExecuteTruncate(TruncateStmt *stmt) * as the relfilenode value. The old storage file is scheduled for * deletion at commit. */ - RelationSetNewRelfilenode(rel, RecentXmin); + RelationSetNewRelfilenode(rel, RecentXmin, minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); @@ -1159,7 +1164,7 @@ ExecuteTruncate(TruncateStmt *stmt) if (OidIsValid(toast_relid)) { rel = relation_open(toast_relid, AccessExclusiveLock); - RelationSetNewRelfilenode(rel, RecentXmin); + RelationSetNewRelfilenode(rel, RecentXmin, minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); heap_close(rel, NoLock); @@ -3516,7 +3521,8 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode) * interest in letting this code work on system catalogs. 
*/ finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, RecentXmin); + false, false, true, RecentXmin, + ReadNextMultiXactId()); } else { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index a719cf24f43..f11a8ec5d42 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -73,6 +73,7 @@ static HeapTuple GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tid, + LockTupleMode lockmode, TupleTableSlot **newSlot); static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, Trigger *trigger, TriggerEvent event, @@ -2147,7 +2148,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, int i; trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - &newSlot); + LockTupleExclusive, &newSlot); if (trigtuple == NULL) return false; @@ -2201,7 +2202,8 @@ ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, if (trigdesc && trigdesc->trig_delete_after_row) { HeapTuple trigtuple = GetTupleForTrigger(estate, NULL, relinfo, - tupleid, NULL); + tupleid, LockTupleExclusive, + NULL); AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_DELETE, true, trigtuple, NULL, NIL, NULL); @@ -2332,10 +2334,24 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, TupleTableSlot *newSlot; int i; Bitmapset *modifiedCols; + Bitmapset *keyCols; + LockTupleMode lockmode; + + /* + * Compute lock mode to use. If columns that are part of the key have not + * been modified, then we can use a weaker lock, allowing for better + * concurrency. 
+ */ + modifiedCols = GetModifiedColumns(relinfo, estate); + keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, true); + if (bms_overlap(keyCols, modifiedCols)) + lockmode = LockTupleExclusive; + else + lockmode = LockTupleNoKeyExclusive; /* get a copy of the on-disk tuple we are planning to update */ trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - &newSlot); + lockmode, &newSlot); if (trigtuple == NULL) return NULL; /* cancel the update action */ @@ -2357,7 +2373,6 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, newtuple = slottuple; } - modifiedCols = GetModifiedColumns(relinfo, estate); LocTriggerData.type = T_TriggerData; LocTriggerData.tg_event = TRIGGER_EVENT_UPDATE | @@ -2426,7 +2441,8 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, if (trigdesc && trigdesc->trig_update_after_row) { HeapTuple trigtuple = GetTupleForTrigger(estate, NULL, relinfo, - tupleid, NULL); + tupleid, LockTupleExclusive, + NULL); AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_UPDATE, true, trigtuple, newtuple, recheckIndexes, @@ -2565,6 +2581,7 @@ GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tid, + LockTupleMode lockmode, TupleTableSlot **newSlot) { Relation relation = relinfo->ri_RelationDesc; @@ -2589,8 +2606,8 @@ ltrmark:; tuple.t_self = *tid; test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, - LockTupleExclusive, false /* wait */, - &buffer, &hufd); + lockmode, false /* wait */, + false, &buffer, &hufd); switch (test) { case HeapTupleSelfUpdated: @@ -2630,6 +2647,7 @@ ltrmark:; epqstate, relation, relinfo->ri_RangeTableIndex, + lockmode, &hufd.ctid, hufd.xmax); if (!TupIsNull(epqslot)) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 2d3170a2504..a37a54e5b42 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -26,6 +26,7 @@ #include "access/genam.h" #include "access/heapam.h" #include 
"access/htup_details.h" +#include "access/multixact.h" #include "access/transam.h" #include "access/xact.h" #include "catalog/namespace.h" @@ -63,7 +64,7 @@ static BufferAccessStrategy vac_strategy; /* non-export function prototypes */ static List *get_rel_oids(Oid relid, const RangeVar *vacrel); -static void vac_truncate_clog(TransactionId frozenXID); +static void vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti); static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound); @@ -379,7 +380,8 @@ vacuum_set_xid_limits(int freeze_min_age, bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, - TransactionId *freezeTableLimit) + TransactionId *freezeTableLimit, + MultiXactId *multiXactFrzLimit) { int freezemin; TransactionId limit; @@ -463,8 +465,22 @@ vacuum_set_xid_limits(int freeze_min_age, *freezeTableLimit = limit; } -} + if (multiXactFrzLimit != NULL) + { + MultiXactId mxLimit; + + /* + * simplistic multixactid freezing: use the same freezing policy as + * for Xids + */ + mxLimit = GetOldestMultiXactId() - freezemin; + if (mxLimit < FirstMultiXactId) + mxLimit = FirstMultiXactId; + + *multiXactFrzLimit = mxLimit; + } +} /* * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples @@ -574,7 +590,8 @@ void vac_update_relstats(Relation relation, BlockNumber num_pages, double num_tuples, BlockNumber num_all_visible_pages, - bool hasindex, TransactionId frozenxid) + bool hasindex, TransactionId frozenxid, + MultiXactId minmulti) { Oid relid = RelationGetRelid(relation); Relation rd; @@ -648,6 +665,14 @@ vac_update_relstats(Relation relation, dirty = true; } + /* relminmxid must never go backward, either */ + if (MultiXactIdIsValid(minmulti) && + MultiXactIdPrecedes(pgcform->relminmxid, minmulti)) + { + pgcform->relminmxid = minmulti; + dirty = true; + } + /* If anything changed, write out the tuple. 
*/ if (dirty) heap_inplace_update(rd, ctup); @@ -660,8 +685,13 @@ vac_update_relstats(Relation relation, * vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB * * Update pg_database's datfrozenxid entry for our database to be the - * minimum of the pg_class.relfrozenxid values. If we are able to - * advance pg_database.datfrozenxid, also try to truncate pg_clog. + * minimum of the pg_class.relfrozenxid values. + * + * Similarly, update our datfrozenmulti to be the minimum of the + * pg_class.relfrozenmulti values. + * + * If we are able to advance either pg_database value, also try to + * truncate pg_clog and pg_multixact. * * We violate transaction semantics here by overwriting the database's * existing pg_database tuple with the new value. This is reasonably @@ -678,17 +708,24 @@ vac_update_datfrozenxid(void) SysScanDesc scan; HeapTuple classTup; TransactionId newFrozenXid; + MultiXactId newFrozenMulti; bool dirty = false; /* * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- * committed pg_class entries for new tables; see AddNewRelationTuple(). - * Se we cannot produce a wrong minimum by starting with this. + * So we cannot produce a wrong minimum by starting with this. */ newFrozenXid = GetOldestXmin(true, true); /* + * Similarly, initialize the MultiXact "min" with the value that would + * be used on pg_class for new tables. See AddNewRelationTuple(). + */ + newFrozenMulti = GetOldestMultiXactId(); + + /* * We must seqscan pg_class to find the minimum Xid, because there is no * index that can help us here. 
*/ @@ -710,9 +747,13 @@ vac_update_datfrozenxid(void) continue; Assert(TransactionIdIsNormal(classForm->relfrozenxid)); + Assert(MultiXactIdIsValid(classForm->relminmxid)); if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid)) newFrozenXid = classForm->relfrozenxid; + + if (MultiXactIdPrecedes(classForm->relminmxid, newFrozenMulti)) + newFrozenMulti = classForm->relminmxid; } /* we're done with pg_class */ @@ -720,6 +761,7 @@ vac_update_datfrozenxid(void) heap_close(relation, AccessShareLock); Assert(TransactionIdIsNormal(newFrozenXid)); + Assert(MultiXactIdIsValid(newFrozenMulti)); /* Now fetch the pg_database tuple we need to update. */ relation = heap_open(DatabaseRelationId, RowExclusiveLock); @@ -740,6 +782,13 @@ vac_update_datfrozenxid(void) dirty = true; } + /* ditto */ + if (MultiXactIdPrecedes(dbform->datminmxid, newFrozenMulti)) + { + dbform->datminmxid = newFrozenMulti; + dirty = true; + } + if (dirty) heap_inplace_update(relation, tuple); @@ -752,7 +801,7 @@ vac_update_datfrozenxid(void) * this action will update that too. */ if (dirty || ForceTransactionIdLimitUpdate()) - vac_truncate_clog(newFrozenXid); + vac_truncate_clog(newFrozenXid, newFrozenMulti); } @@ -771,17 +820,19 @@ vac_update_datfrozenxid(void) * info is stale. 
*/ static void -vac_truncate_clog(TransactionId frozenXID) +vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti) { TransactionId myXID = GetCurrentTransactionId(); Relation relation; HeapScanDesc scan; HeapTuple tuple; - Oid oldest_datoid; + Oid oldestxid_datoid; + Oid oldestmulti_datoid; bool frozenAlreadyWrapped = false; - /* init oldest_datoid to sync with my frozenXID */ - oldest_datoid = MyDatabaseId; + /* init oldest datoids to sync with my frozen values */ + oldestxid_datoid = MyDatabaseId; + oldestmulti_datoid = MyDatabaseId; /* * Scan pg_database to compute the minimum datfrozenxid @@ -804,13 +855,20 @@ vac_truncate_clog(TransactionId frozenXID) Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple); Assert(TransactionIdIsNormal(dbform->datfrozenxid)); + Assert(MultiXactIdIsValid(dbform->datminmxid)); if (TransactionIdPrecedes(myXID, dbform->datfrozenxid)) frozenAlreadyWrapped = true; else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID)) { frozenXID = dbform->datfrozenxid; - oldest_datoid = HeapTupleGetOid(tuple); + oldestxid_datoid = HeapTupleGetOid(tuple); + } + + if (MultiXactIdPrecedes(dbform->datminmxid, frozenMulti)) + { + frozenMulti = dbform->datminmxid; + oldestmulti_datoid = HeapTupleGetOid(tuple); } } @@ -832,14 +890,18 @@ vac_truncate_clog(TransactionId frozenXID) return; } - /* Truncate CLOG to the oldest frozenxid */ + /* Truncate CLOG and Multi to the oldest computed value */ TruncateCLOG(frozenXID); + TruncateMultiXact(frozenMulti); /* - * Update the wrap limit for GetNewTransactionId. Note: this function - * will also signal the postmaster for an(other) autovac cycle if needed. + * Update the wrap limit for GetNewTransactionId and creation of new + * MultiXactIds. Note: these functions will also signal the postmaster for + * an(other) autovac cycle if needed. XXX should we avoid possibly + * signalling twice? 
*/ - SetTransactionIdLimit(frozenXID, oldest_datoid); + SetTransactionIdLimit(frozenXID, oldestxid_datoid); + MultiXactAdvanceOldest(frozenMulti, oldestmulti_datoid); } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 8eda66364b3..5ec65ea41be 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -41,6 +41,7 @@ #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/transam.h" #include "access/visibilitymap.h" #include "catalog/storage.h" @@ -124,6 +125,7 @@ static int elevel = -1; static TransactionId OldestXmin; static TransactionId FreezeLimit; +static MultiXactId MultiXactFrzLimit; static BufferAccessStrategy vac_strategy; @@ -180,6 +182,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, double new_rel_tuples; BlockNumber new_rel_allvisible; TransactionId new_frozen_xid; + MultiXactId new_min_multi; /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) @@ -197,7 +200,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, - &OldestXmin, &FreezeLimit, &freezeTableLimit); + &OldestXmin, &FreezeLimit, &freezeTableLimit, + &MultiXactFrzLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); @@ -267,12 +271,17 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, if (vacrelstats->scanned_pages < vacrelstats->rel_pages) new_frozen_xid = InvalidTransactionId; + new_min_multi = MultiXactFrzLimit; + if (vacrelstats->scanned_pages < vacrelstats->rel_pages) + new_min_multi = InvalidMultiXactId; + vac_update_relstats(onerel, new_rel_pages, new_rel_tuples, new_rel_allvisible, vacrelstats->hasindex, - new_frozen_xid); + new_frozen_xid, + new_min_multi); /* * Report results to the stats 
collector, too. An early terminated @@ -839,7 +848,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ - if (heap_freeze_tuple(tuple.t_data, FreezeLimit)) + if (heap_freeze_tuple(tuple.t_data, FreezeLimit, + MultiXactFrzLimit)) frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -857,7 +867,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, - frozen, nfrozen); + MultiXactFrzLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -1176,7 +1186,8 @@ lazy_check_needs_freeze(Buffer buf) tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); - if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, buf)) + if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, + MultiXactFrzLimit, buf)) return true; } /* scan along page */ @@ -1253,7 +1264,8 @@ lazy_cleanup_index(Relation indrel, stats->num_index_tuples, 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); ereport(elevel, (errmsg("index \"%s\" now contains %.0f row versions in %u pages", |