summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Paquier, 2024-02-20 04:43:51 +0000
committer: Michael Paquier, 2024-02-20 04:43:51 +0000
commit: 818fefd8fd4412d45eb542155cb2833a2b864acc (patch)
tree: aee58e95702240630654c8a9e087968f94ca5eae
parent: 01ec4d89b91ed4c0cad57b188b530b9e7980ccb5 (diff)
Fix race leading to incorrect conflict cause in InvalidatePossiblyObsoleteSlot()

The invalidation of an active slot is done in two steps:
- Termination of the backend holding it, if any.
- Report that the slot is obsolete, with a conflict cause depending on the slot's data.

This can be racy because between these two steps the slot mutex would be released while doing system calls, which means that the effective_xmin and effective_catalog_xmin could advance during that time, detecting a conflict cause different than the one originally wanted before the process owning a slot is terminated.

Holding the mutex longer is not an option, so this commit changes the code to record the LSNs stored in the slot during the termination of the process owning the slot.

Bonus thanks to Alexander Lakhin for the various tests and the analysis.

Author: Bertrand Drouvot
Reviewed-by: Michael Paquier, Bharath Rupireddy
Discussion: https://postgr.es/m/ZaTjW2Xh+TQUCOH0@ip-10-97-1-34.eu-west-3.compute.internal
Backpatch-through: 16

-rw-r--r-- src/backend/replication/slot.c | 39
1 files changed, 33 insertions, 6 deletions
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 2180a380632..a142855bd32 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1454,6 +1454,11 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
{
int last_signaled_pid = 0;
bool released_lock = false;
+ bool terminated = false;
+ XLogRecPtr initial_effective_xmin = InvalidXLogRecPtr;
+ XLogRecPtr initial_catalog_effective_xmin = InvalidXLogRecPtr;
+ XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr;
+ ReplicationSlotInvalidationCause conflict_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
for (;;)
{
@@ -1488,11 +1493,24 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
*/
if (s->data.invalidated == RS_INVAL_NONE)
{
+ /*
+ * The slot's mutex will be released soon, and it is possible that
+ * those values change since the process holding the slot has been
+ * terminated (if any), so record them here to ensure that we
+ * would report the correct conflict cause.
+ */
+ if (!terminated)
+ {
+ initial_restart_lsn = s->data.restart_lsn;
+ initial_effective_xmin = s->effective_xmin;
+ initial_catalog_effective_xmin = s->effective_catalog_xmin;
+ }
+
switch (cause)
{
case RS_INVAL_WAL_REMOVED:
- if (s->data.restart_lsn != InvalidXLogRecPtr &&
- s->data.restart_lsn < oldestLSN)
+ if (initial_restart_lsn != InvalidXLogRecPtr &&
+ initial_restart_lsn < oldestLSN)
conflict = cause;
break;
case RS_INVAL_HORIZON:
@@ -1501,12 +1519,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
/* invalid DB oid signals a shared relation */
if (dboid != InvalidOid && dboid != s->data.database)
break;
- if (TransactionIdIsValid(s->effective_xmin) &&
- TransactionIdPrecedesOrEquals(s->effective_xmin,
+ if (TransactionIdIsValid(initial_effective_xmin) &&
+ TransactionIdPrecedesOrEquals(initial_effective_xmin,
snapshotConflictHorizon))
conflict = cause;
- else if (TransactionIdIsValid(s->effective_catalog_xmin) &&
- TransactionIdPrecedesOrEquals(s->effective_catalog_xmin,
+ else if (TransactionIdIsValid(initial_catalog_effective_xmin) &&
+ TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin,
snapshotConflictHorizon))
conflict = cause;
break;
@@ -1519,6 +1537,13 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
}
}
+ /*
+ * The conflict cause recorded previously should not change while the
+ * process owning the slot (if any) has been terminated.
+ */
+ Assert(!(conflict_prev != RS_INVAL_NONE && terminated &&
+ conflict_prev != conflict));
+
/* if there's no conflict, we're done */
if (conflict == RS_INVAL_NONE)
{
@@ -1601,6 +1626,8 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
(void) kill(active_pid, SIGTERM);
last_signaled_pid = active_pid;
+ terminated = true;
+ conflict_prev = conflict;
}
/* Wait until the slot is released. */