Skip to content

Commit 71b49ea

Browse files
michail-nikolaevCommitfest Bot
authored andcommitted
Fix btree index scan concurrency issues with dirty snapshots
This patch addresses an issue where non-MVCC index scans using SnapshotDirty or SnapshotSelf could miss tuples due to concurrent modifications. The fix retains read locks on pages for these special snapshot types until the scan is done with the page's tuples, preventing concurrent modifications from causing inconsistent results. Updated README to document this special case in the btree locking mechanism.
1 parent f2b1102 commit 71b49ea

File tree

5 files changed

+46
-7
lines changed

5 files changed

+46
-7
lines changed

src/backend/access/nbtree/README

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we
8585
came from. (Actually, it's even harder than that; see page deletion
8686
discussion below.)
8787

88-
Page read locks are held only for as long as a scan is examining a page.
88+
Page read locks are held only for as long as a scan is examining a page
89+
(with exception for SnapshotDirty and SnapshotSelf scans - see below).
8990
To minimize lock/unlock traffic, an index scan always searches a leaf page
9091
to identify all the matching items at once, copying their heap tuple IDs
9192
into backend-local storage. The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
103104
(though this requires extra handling to account for concurrent splits of
104105
the left sibling; see detailed move-left algorithm below).
105106

107+
Despite the described mechanics in place, inconsistent results may still occur
108+
during non-MVCC scans (SnapshotDirty and SnapshotSelf). This issue can occur if a
109+
concurrent transaction deletes a tuple and inserts a new tuple with a new TID in the
110+
same page. If the scan has already visited the page and cached its content in the
111+
backend-local storage, it might skip the old tuple due to deletion and miss the new
112+
tuple because the scan does not re-read the page. To address this issue, for
113+
SnapshotDirty and SnapshotSelf scans, we retain the read lock on the page until
114+
we're completely done processing all the tuples from that page, preventing
115+
concurrent modifications that could lead to inconsistent results.
116+
106117
In most cases we release our lock and pin on a page before attempting
107118
to acquire pin and lock on the page we are moving to. In a few places
108119
it is necessary to lock the next page before releasing the current one.

src/backend/access/nbtree/nbtree.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,10 +393,22 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
393393
/* Before leaving current page, deal with any killed items */
394394
if (so->numKilled > 0)
395395
_bt_killitems(scan);
396+
else if (!so->dropLock) /* _bt_killitems always releases lock */
397+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
396398
BTScanPosUnpinIfPinned(so->currPos);
397399
BTScanPosInvalidate(so->currPos);
398400
}
399401

402+
/*
403+
* For SnapshotDirty and SnapshotSelf scans, we don't unlock the buffer
404+
* and keep the lock should be until we're completely done with this page.
405+
* This prevents concurrent modifications from causing inconsistent
406+
* results during non-MVCC scans.
407+
*
408+
* See nbtree/README for information about SnapshotDirty and SnapshotSelf.
409+
*/
410+
so->dropLock = scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY
411+
&& scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF;
400412
/*
401413
* We prefer to eagerly drop leaf page pins before btgettuple returns.
402414
* This avoids making VACUUM wait to acquire a cleanup lock on the page.
@@ -420,7 +432,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
420432
*
421433
* Note: so->dropPin should never change across rescans.
422434
*/
423-
so->dropPin = (!scan->xs_want_itup &&
435+
so->dropPin = (so->dropLock &&
436+
!scan->xs_want_itup &&
424437
IsMVCCSnapshot(scan->xs_snapshot) &&
425438
RelationNeedsWAL(scan->indexRelation) &&
426439
scan->heapRelation != NULL);
@@ -477,6 +490,8 @@ btendscan(IndexScanDesc scan)
477490
/* Before leaving current page, deal with any killed items */
478491
if (so->numKilled > 0)
479492
_bt_killitems(scan);
493+
else if (!so->dropLock) /* _bt_killitems always releases lock */
494+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
480495
BTScanPosUnpinIfPinned(so->currPos);
481496
}
482497

@@ -557,6 +572,8 @@ btrestrpos(IndexScanDesc scan)
557572
/* Before leaving current page, deal with any killed items */
558573
if (so->numKilled > 0)
559574
_bt_killitems(scan);
575+
else if (!so->dropLock) /* _bt_killitems always releases lock */
576+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
560577
BTScanPosUnpinIfPinned(so->currPos);
561578
}
562579

src/backend/access/nbtree/nbtsearch.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,14 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
5757
/*
5858
* _bt_drop_lock_and_maybe_pin()
5959
*
60-
* Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
60+
* Unlock so->currPos.buf if so->dropLock. If scan is so->dropPin, drop the pin, too.
6161
* Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
6262
*/
6363
static inline void
6464
_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
6565
{
66+
if (!so->dropLock)
67+
return;
6668
if (!so->dropPin)
6769
{
6870
/* Just drop the lock (not the pin) */
@@ -1532,7 +1534,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
15321534
* _bt_next() -- Get the next item in a scan.
15331535
*
15341536
* On entry, so->currPos describes the current page, which may be pinned
1535-
* but is not locked, and so->currPos.itemIndex identifies which item was
1537+
* but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
1538+
* the page remains locked), and so->currPos.itemIndex identifies which item was
15361539
* previously returned.
15371540
*
15381541
* On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2111,7 +2114,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
21112114
* Wrapper on _bt_readnextpage that performs final steps for the current page.
21122115
*
21132116
* On entry, so->currPos must be valid. Its buffer will be pinned, though
2114-
* never locked. (Actually, when so->dropPin there won't even be a pin held,
2117+
* never locked, except for SnapshotDirty and SnapshotSelf scans where the buffer
2118+
* remains locked until we're done with all tuples from the page
2119+
* (Actually, when so->dropPin there won't even be a pin held,
21152120
* though so->currPos.currPage must still be set to a valid block number.)
21162121
*/
21172122
static bool
@@ -2126,6 +2131,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
21262131
/* Before leaving current page, deal with any killed items */
21272132
if (so->numKilled > 0)
21282133
_bt_killitems(scan);
2134+
else if (!so->dropLock) /* _bt_killitems always releases lock */
2135+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
21292136

21302137
/*
21312138
* Before we modify currPos, make a copy of the page data if there was a
@@ -2265,7 +2272,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
22652272
}
22662273

22672274
/* There's no actually-matching data on the page in so->currPos.buf */
2268-
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2275+
if (so->dropLock)
2276+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
22692277

22702278
/* Call _bt_readnextpage using its _bt_steppage wrapper function */
22712279
if (!_bt_steppage(scan, dir))

src/backend/access/nbtree/nbtutils.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3379,8 +3379,10 @@ _bt_killitems(IndexScanDesc scan)
33793379
* concurrent VACUUMs from recycling any of the TIDs on the page.
33803380
*/
33813381
Assert(BTScanPosIsPinned(so->currPos));
3382+
/* Lock only if the lock is dropped. */
33823383
buf = so->currPos.buf;
3383-
_bt_lockbuf(rel, buf, BT_READ);
3384+
if (so->dropLock)
3385+
_bt_lockbuf(rel, buf, BT_READ);
33843386
}
33853387
else
33863388
{

src/include/access/nbtree.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,7 @@ typedef struct BTScanOpaqueData
10701070
/* info about killed items if any (killedItems is NULL if never used) */
10711071
int *killedItems; /* currPos.items indexes of killed items */
10721072
int numKilled; /* number of currently stored items */
1073+
bool dropLock; /* drop lock on before btgettuple returns? */
10731074
bool dropPin; /* drop leaf pin before btgettuple returns? */
10741075

10751076
/*

0 commit comments

Comments
 (0)