Diffstat (limited to 'src')
| -rw-r--r-- | src/backend/access/hash/Makefile | 2 |
| -rw-r--r-- | src/backend/access/hash/README | 138 |
| -rw-r--r-- | src/backend/access/hash/hash.c | 81 |
| -rw-r--r-- | src/backend/access/hash/hash_xlog.c | 963 |
| -rw-r--r-- | src/backend/access/hash/hashinsert.c | 59 |
| -rw-r--r-- | src/backend/access/hash/hashovfl.c | 209 |
| -rw-r--r-- | src/backend/access/hash/hashpage.c | 236 |
| -rw-r--r-- | src/backend/access/hash/hashsearch.c | 5 |
| -rw-r--r-- | src/backend/access/rmgrdesc/hashdesc.c | 134 |
| -rw-r--r-- | src/backend/commands/indexcmds.c | 5 |
| -rw-r--r-- | src/backend/utils/cache/relcache.c | 12 |
| -rw-r--r-- | src/include/access/hash_xlog.h | 232 |
| -rw-r--r-- | src/test/regress/expected/create_index.out | 5 |
| -rw-r--r-- | src/test/regress/expected/enum.out | 1 |
| -rw-r--r-- | src/test/regress/expected/hash_index.out | 2 |
| -rw-r--r-- | src/test/regress/expected/macaddr.out | 1 |
| -rw-r--r-- | src/test/regress/expected/replica_identity.out | 1 |
| -rw-r--r-- | src/test/regress/expected/uuid.out | 1 |
18 files changed, 1998 insertions, 89 deletions
diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile
index e2e7e914931..b154569b465 100644
--- a/src/backend/access/hash/Makefile
+++ b/src/backend/access/hash/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global

OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \
-       hashsort.o hashutil.o hashvalidate.o
+       hashsort.o hashutil.o hashvalidate.o hash_xlog.o

include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 703ae982071..00beb86ffae 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -213,7 +213,7 @@ this flag must be clear before splitting a bucket; thus, a bucket can't be
split again until the previous split is totally complete.

The moved-by-split flag on a tuple indicates that tuple is moved from old to
-new bucket.  Concurrent scans can skip such tuples till the split operation
+new bucket.  Concurrent scans will skip such tuples until the split operation
is finished.  Once the tuple is marked as moved-by-split, it will remain so
forever but that does no harm.  We have intentionally not cleared it as that
can generate an additional I/O which is not necessary.
@@ -287,13 +287,17 @@ The insertion algorithm is rather similar:
        if current page is full, release lock but not pin,
            read/exclusive-lock next page; repeat as needed
    >> see below if no space in any page of bucket
+   take buffer content lock in exclusive mode on metapage
    insert tuple at appropriate place in page
-   mark current page dirty and release buffer content lock and pin
-   if the current page is not a bucket page, release the pin on bucket page
-   pin meta page and take buffer content lock in exclusive mode
+   mark current page dirty
    increment tuple count, decide if split needed
-   mark meta page dirty and release buffer content lock and pin
-   done if no split needed, else enter Split algorithm below
+   mark meta page dirty
+   write WAL for insertion of tuple
+   release the buffer content lock on metapage
+   release buffer content lock on current page
+   if current page is not a bucket page, release the pin on bucket page
+   if split is needed, enter Split algorithm below
+   release the pin on metapage

To speed searches, the index entries within any individual index page are
kept sorted by hash code; the insertion code must take care to insert new
@@ -328,12 +332,17 @@ existing bucket in two, thereby lowering the fill ratio:
        try to finish the split and the cleanup work
        if that succeeds, start over; if it fails, give up
    mark the old and new buckets indicating split is in progress
+   mark both old and new buckets as dirty
+   write WAL for allocation of new page for split
    copy the tuples that belong to new bucket from old bucket, marking them
        as moved-by-split
+   write WAL record for moving tuples to new page once the new page is full
+       or all the pages of old bucket are finished
    release lock but not pin for primary bucket page of old bucket,
        read/shared-lock next page; repeat as needed
    clear the bucket-being-split and bucket-being-populated flags
    mark the old bucket indicating split-cleanup
+   write WAL for changing the flags on both old and new buckets
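For concreteness, the "write WAL for insertion of tuple" step above reduces to
the standard critical-section logging pattern shown below.  This is a condensed
sketch of the _hash_doinsert() changes in the hashinsert.c hunk later in this
patch; variable declarations, error handling, and the surrounding locking steps
are omitted.

    START_CRIT_SECTION();

    /* add the tuple and bump the tuple count while both pages are locked */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);
    metap->hashm_ntuples += 1;
    MarkBufferDirty(metabuf);

    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;
        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        /* both modified pages go into the same record */
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();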

The split operation's attempt to acquire cleanup-lock on the old bucket number
could fail if another process holds any lock or pin on it.  We do not want to
@@ -369,6 +378,8 @@ The fourth operation is garbage collection (bulk deletion):
    acquire cleanup lock on primary bucket page
    loop:
        scan and remove tuples
+       mark the target page dirty
+       write WAL for deleting tuples from target page
        if this is the last bucket page, break out of loop
        pin and x-lock next page
        release prior lock and pin (except keep pin on primary bucket page)
@@ -383,7 +394,8 @@ The fourth operation is garbage collection (bulk deletion):
    check if number of buckets changed
    if so, release content lock and pin and return to for-each-bucket loop
    else update metapage tuple count
-   mark meta page dirty and release buffer content lock and pin
+   mark meta page dirty and write WAL for update of metapage
+   release buffer content lock and pin

Note that this is designed to allow concurrent splits and scans.  If a split
occurs, tuples relocated into the new bucket will be visited twice by the
@@ -425,18 +437,16 @@ Obtaining an overflow page:
    search for a free page (zero bit in bitmap)
    if found:
        set bit in bitmap
-       mark bitmap page dirty and release content lock
+       mark bitmap page dirty
        take metapage buffer content lock in exclusive mode
        if first-free-bit value did not change,
            update it and mark meta page dirty
-       release meta page buffer content lock
-       return page number
    else (not found):
        release bitmap page buffer content lock
        loop back to try next bitmap page, if any
    -- here when we have checked all bitmap pages; we hold meta excl. lock
    extend index to add another overflow page; update meta information
-   mark meta page dirty and release buffer content lock
+   mark meta page dirty
    return page number

It is slightly annoying to release and reacquire the metapage lock
@@ -456,12 +466,17 @@ like this:
    -- having determined that no space is free in the target bucket:
    remember last page of bucket, drop write lock on it
-   call free-page-acquire routine
    re-write-lock last page of bucket
    if it is not last anymore, step to the last page
-   update (former) last page to point to new page
+   execute free-page-acquire (obtaining an overflow page) mechanism
+       described above
+   update (former) last page to point to the new page and mark buffer dirty
    write-lock and initialize new page, with back link to former last page
-   write and release former last page
+   write WAL for addition of overflow page
+   release the locks on meta page and bitmap page acquired in
+       free-page-acquire algorithm
+   release the lock on former last page
+   release the lock on new overflow page
    insert tuple into new page
    -- etc.

@@ -488,12 +503,14 @@ accessors of pages in the bucket.  The algorithm is:
    determine which bitmap page contains the free space bit for page
    release meta page buffer content lock
    pin bitmap page and take buffer content lock in exclusive mode
-   update bitmap bit
-   mark bitmap page dirty and release buffer content lock and pin
-   if page number is less than what we saw as first-free-bit in meta:
    retake meta page buffer content lock in exclusive mode
+   move (insert) tuples that belong to the overflow page being freed
+   update bitmap bit
+   mark bitmap page dirty
    if page number is still less than first-free-bit,
        update first-free-bit field and mark meta page dirty
+   write WAL for delinking overflow page operation
+   release buffer content lock and pin
    release meta page buffer content lock and pin
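Each WAL record introduced by the algorithms above has a matching redo routine
in the new hash_xlog.c file, shown in full later in this patch.  As a sketch of
the replay convention those routines share, the metapage-update record written
by the deletion algorithm is replayed roughly as follows (condensed from
hash_xlog_update_meta_page() below):

    static void
    hash_xlog_update_meta_page(XLogReaderState *record)
    {
        XLogRecPtr  lsn = record->EndRecPtr;
        xl_hash_update_meta_page *xldata =
            (xl_hash_update_meta_page *) XLogRecGetData(record);
        Buffer      metabuf;

        /* reapply the change only if the page predates this record */
        if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
        {
            Page        page = BufferGetPage(metabuf);

            HashPageGetMeta(page)->hashm_ntuples = xldata->ntuples;
            PageSetLSN(page, lsn);
            MarkBufferDirty(metabuf);
        }
        if (BufferIsValid(metabuf))
            UnlockReleaseBuffer(metabuf);
    }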

We have to do it this way because we must clear the bitmap bit before
@@ -504,8 +521,91 @@ page acquirer will scan more bitmap bits than he needs to.
What must be avoided is having first-free-bit greater than the actual first
free bit, because then that free page would never be found by searchers.

-All the freespace operations should be called while holding no buffer
-locks.  Since they need no lmgr locks, deadlock is not possible.
+The reason for moving tuples from the overflow page while delinking the latter
+is to make those two changes a single atomic operation.  Not doing so could
+lead to spurious reads on standby.  Basically, the user might see the same
+tuple twice.
+
+
+WAL Considerations
+------------------
+
+The hash index operations like create index, insert, delete, bucket split,
+allocate overflow page, and squeeze in themselves don't guarantee hash index
+consistency after a crash.  To provide robustness, we write WAL for each of
+these operations.
+
+CREATE INDEX writes multiple WAL records.  First, we write a record to cover
+the initialization of the metapage, followed by one for each new bucket
+created, followed by one for the initial bitmap page.  It's not important for
+index creation to appear atomic, because the index isn't yet visible to any
+other transaction, and the creating transaction will roll back in the event of
+a crash.  It would be difficult to cover the whole operation with a single
+write-ahead log record anyway, because we can log only a fixed number of
+pages, as given by XLR_MAX_BLOCK_ID (32), with the current XLog machinery.
+
+Ordinary item insertions (that don't force a page split or need a new overflow
+page) are single WAL entries.  They touch a single bucket page and the
+metapage.  The metapage is updated during replay just as it is updated during
+the original operation.
+
+If an insertion causes the addition of an overflow page, there will be one
+WAL entry for the new overflow page and a second entry for the insert itself.
+
+If an insertion causes a bucket split, there will be one WAL entry for the
+insert itself, followed by a WAL entry for allocating a new bucket, followed
+by a WAL entry for each overflow bucket page in the new bucket to which the
+tuples are moved from the old bucket, followed by a WAL entry to indicate that
+the split is complete for both old and new buckets.  A split operation which
+requires overflow pages to complete the operation will need to write a WAL
+record for each new allocation of an overflow page.
+
+As splitting involves multiple atomic actions, it's possible that the system
+crashes between moving tuples from the bucket pages of the old bucket to the
+new bucket.  In such a case, after recovery, the old and new buckets will be
+marked with the bucket-being-split and bucket-being-populated flags
+respectively, which indicate that a split is in progress for those buckets.
+The reader algorithm works correctly, as it will scan both the old and new
+buckets when the split is in progress, as explained in the reader algorithm
+section above.
+
+We finish the split at the next insert or split operation on the old bucket,
+as explained in the insert and split algorithms above.  It could be done
+during searches, too, but it seems best not to put any extra updates in what
+would otherwise be a read-only operation (updating is not possible in hot
+standby mode anyway).  It would seem natural to complete the split in VACUUM,
+but since splitting a bucket might require allocating a new page, it might
+fail if you run out of disk space.  That would be bad during VACUUM - the
+reason for running VACUUM in the first place might be that you ran out of
+disk space, and now VACUUM won't finish because you're out of disk space.  In
+contrast, an insertion can require enlarging the physical file anyway.
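The per-bucket records mentioned in the CREATE INDEX paragraph above are plain
full-page images.  The sketch below condenses the corresponding loop from the
_hash_init() changes in the hashpage.c hunk later in this patch; the
BUCKET_TO_BLKNO() computation of blkno is an assumption, since that part of the
loop lies outside the visible hunk.

    /* one full-page-image WAL record per newly created bucket page */
    for (i = 0; i < num_buckets; i++)
    {
        BlockNumber blkno = BUCKET_TO_BLKNO(metap, i);   /* assumed */
        Buffer      buf;

        buf = _hash_getnewbuf(rel, blkno, forkNum);
        _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
        MarkBufferDirty(buf);
        log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true);
        _hash_relbuf(rel, buf);
    }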
+
+Deletion of tuples from a bucket is performed for two reasons: to remove dead
+tuples, and to remove tuples that were moved by a bucket split.  A WAL entry
+is made for each bucket page from which tuples are removed, and then another
+WAL entry is made when we clear the needs-split-cleanup flag.  If dead tuples
+are removed, a separate WAL entry is made to update the metapage.
+
+As deletion involves multiple atomic operations, it is quite possible that the
+system crashes (a) after removing tuples from some of the bucket pages, (b)
+before clearing the garbage flag, or (c) before updating the metapage.  If the
+system crashes before completing (b), it will again try to clean the bucket
+during the next vacuum or insert after recovery, which can have some
+performance impact, but it will work fine.  If the system crashes before
+completing (c), after recovery there could be some additional splits until the
+next vacuum updates the metapage, but the other operations like insert, delete
+and scan will work correctly.  We can fix this problem by actually updating
+the metapage based on the delete operation during replay, but it's not clear
+whether it's worth the complication.
+
+A squeeze operation moves tuples from one of the buckets later in the chain to
+one of the buckets earlier in the chain, and writes a WAL record whenever the
+bucket to which it is writing tuples fills up or the bucket from which it is
+removing tuples becomes empty.
+
+As a squeeze operation involves multiple atomic operations, it is quite
+possible that the system crashes before completing the operation on the
+entire bucket.  After recovery, the operations will work correctly, but the
+index will remain bloated, and this can impact performance of read and insert
+operations until the next vacuum squeezes the bucket completely.


Other Notes
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 1f8a7f61c72..641676964bb 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -28,6 +28,7 @@
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"
+#include "miscadmin.h"


/* Working state for hashbuild and its callback */
@@ -303,6 +304,11 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
            buf = so->hashso_curbuf;
            Assert(BufferIsValid(buf));
            page = BufferGetPage(buf);
+
+           /*
+            * We don't need to test for old snapshots here, as the current
+            * buffer is pinned, so vacuum can't clean the page.
+            */
            maxoffnum = PageGetMaxOffsetNumber(page);
            for (offnum = ItemPointerGetOffsetNumber(current);
                 offnum <= maxoffnum;
@@ -623,6 +629,7 @@ loop_top:
    }

    /* Okay, we're really done.  Update tuple count in metapage.
*/ + START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) @@ -649,6 +656,26 @@ loop_top: } MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_update_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.ntuples = metap->hashm_ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(SizeOfHashUpdateMetaPage)); + + XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + _hash_relbuf(rel, metabuf); /* return statistics */ @@ -816,9 +843,40 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, */ if (ndeletable > 0) { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_delete xlrec; + XLogRecPtr recptr; + + xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashDelete); + + /* + * bucket buffer needs to be registered to ensure that we can + * acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_primary_bucket_page) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ @@ -866,8 +924,25 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); } /* @@ -881,9 +956,3 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); } - -void -hash_redo(XLogReaderState *record) -{ - elog(PANIC, "hash_redo: unimplemented"); -} diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c new file mode 100644 index 00000000000..d435215259b --- /dev/null +++ b/src/backend/access/hash/hash_xlog.c @@ -0,0 +1,963 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.c + * WAL replay logic for hash index. 
+ * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hash_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xlogutils.h" + +/* + * replay a hash index meta page + */ +static void +hash_xlog_init_meta_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Page page; + Buffer metabuf; + + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); + + /* create the index' metapage */ + metabuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(metabuf)); + _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid, + xlrec->ffactor, true); + page = (Page) BufferGetPage(metabuf); + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + /* all done */ + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index bitmap page + */ +static void +hash_xlog_init_bitmap_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer bitmapbuf; + Buffer metabuf; + Page page; + HashMetaPage metap; + uint32 num_buckets; + + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); + + /* + * Initialize bitmap page + */ + bitmapbuf = XLogInitBufferForRedo(record, 0); + _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); + PageSetLSN(BufferGetPage(bitmapbuf), lsn); + MarkBufferDirty(bitmapbuf); + UnlockReleaseBuffer(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the bitmap page. But during replay it's not + * necessary to hold that lock, since nobody can see it yet; the + * creating transaction hasn't yet committed. + */ + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + num_buckets = metap->hashm_maxbucket + 1; + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + metap->hashm_nmaps++; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index insert without split + */ +static void +hash_xlog_insert(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "hash_xlog_insert: failed to add item"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. 
+ */ + page = BufferGetPage(buffer); + metap = HashPageGetMeta(page); + metap->hashm_ntuples += 1; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay addition of overflow page for hash index + */ +static void +hash_xlog_add_ovfl_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); + Buffer leftbuf; + Buffer ovflbuf; + Buffer metabuf; + BlockNumber leftblk; + BlockNumber rightblk; + BlockNumber newmapblk = InvalidBlockNumber; + Page ovflpage; + HashPageOpaque ovflopaque; + uint32 *num_bucket; + char *data; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + bool new_bmpage = false; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + ovflbuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(ovflbuf)); + + data = XLogRecGetBlockData(record, 0, &datalen); + num_bucket = (uint32 *) data; + Assert(datalen == sizeof(uint32)); + _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, + true); + /* update backlink */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = leftblk; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + Page leftpage; + HashPageOpaque leftopaque; + + leftpage = BufferGetPage(leftbuf); + leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); + leftopaque->hasho_nextblkno = rightblk; + + PageSetLSN(leftpage, lsn); + MarkBufferDirty(leftbuf); + } + + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + UnlockReleaseBuffer(ovflbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the overflow pages. But during replay it's not + * necessary to hold those locks, since no other index updates can be + * happening concurrently. 
+ */ + if (XLogRecHasBlockRef(record, 2)) + { + Buffer mapbuffer; + + if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuffer); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 2, &datalen); + bitmap_page_bit = (uint32 *) data; + + SETBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuffer); + } + if (BufferIsValid(mapbuffer)) + UnlockReleaseBuffer(mapbuffer); + } + + if (XLogRecHasBlockRef(record, 3)) + { + Buffer newmapbuf; + + newmapbuf = XLogInitBufferForRedo(record, 3); + + _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); + + new_bmpage = true; + newmapblk = BufferGetBlockNumber(newmapbuf); + + MarkBufferDirty(newmapbuf); + PageSetLSN(BufferGetPage(newmapbuf), lsn); + + UnlockReleaseBuffer(newmapbuf); + } + + if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + uint32 *firstfree_ovflpage; + + data = XLogRecGetBlockData(record, 4, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + if (!xlrec->bmpage_found) + { + metap->hashm_spares[metap->hashm_ovflpoint]++; + + if (new_bmpage) + { + Assert(BlockNumberIsValid(newmapblk)); + + metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; + metap->hashm_nmaps++; + metap->hashm_spares[metap->hashm_ovflpoint]++; + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay allocation of page for split operation + */ +static void +hash_xlog_split_allocate_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + Buffer metabuf; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + char *data; + XLogRedoAction action; + + /* + * To be consistent with normal operation, here we take cleanup locks on + * both the old and new buckets even though there can't be any concurrent + * inserts. + */ + + /* replay the record for old bucket */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the special space is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + oldopaque->hasho_prevblkno = xlrec->new_bucket; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + + /* replay the record for new bucket */ + newbuf = XLogInitBufferForRedo(record, 1); + _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, + xlrec->new_bucket_flag, true); + if (!IsBufferCleanupOK(newbuf)) + elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); + MarkBufferDirty(newbuf); + PageSetLSN(BufferGetPage(newbuf), lsn); + + /* + * We can release the lock on old bucket early as well but doing here to + * consistent with normal operation. 
+ */ + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); + + /* + * Note: in normal operation, we'd update the meta page while still + * holding lock on the old and new bucket pages. But during replay it's + * not necessary to hold those locks, since no other bucket splits can be + * happening concurrently. + */ + + /* replay the record for metapage changes */ + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) + { + Page page; + HashMetaPage metap; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_maxbucket = xlrec->new_bucket; + + data = XLogRecGetBlockData(record, 2, &datalen); + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) + { + uint32 lowmask; + uint32 *highmask; + + /* extract low and high masks. */ + memcpy(&lowmask, data, sizeof(uint32)); + highmask = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_lowmask = lowmask; + metap->hashm_highmask = *highmask; + + data += sizeof(uint32) * 2; + } + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) + { + uint32 ovflpoint; + uint32 *ovflpages; + + /* extract information of overflow pages. */ + memcpy(&ovflpoint, data, sizeof(uint32)); + ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_spares[ovflpoint] = *ovflpages; + metap->hashm_ovflpoint = ovflpoint; + } + + MarkBufferDirty(metabuf); + PageSetLSN(BufferGetPage(metabuf), lsn); + } + + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay of split operation + */ +static void +hash_xlog_split_page(XLogReaderState *record) +{ + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED) + elog(ERROR, "Hash split record did not contain a full-page image"); + + UnlockReleaseBuffer(buf); +} + +/* + * replay completion of split operation + */ +static void +hash_xlog_split_complete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + + /* replay the record for new bucket */ + action = XLogReadBufferForRedo(record, 1, &newbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. 
+ */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page newpage; + HashPageOpaque nopaque; + + newpage = BufferGetPage(newbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->hasho_flag = xlrec->new_bucket_flag; + + PageSetLSN(newpage, lsn); + MarkBufferDirty(newbuf); + } + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); +} + +/* + * replay move of page contents for squeeze operation of hash index + */ +static void +hash_xlog_move_page_contents(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf = InvalidBuffer; + Buffer deletebuf = InvalidBuffer; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for deleting entries from overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) + { + Page page; + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 2, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + + /* + * Replay is complete, now we can release the buffers. We release locks at + * end of replay operation to ensure that we hold lock on primary bucket + * page till end of operation. 
We can optimize by releasing the lock on + * write buffer as soon as the operation for same is complete, if it is + * not same as primary bucket page, but that doesn't seem to be worth + * complicating the code. + */ + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay squeeze page operation of hash index + */ +static void +hash_xlog_squeeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf; + Buffer ovflbuf; + Buffer prevbuf = InvalidBuffer; + Buffer mapbuf; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + /* + * if the page on which are adding tuples is a page previous to freed + * overflow page, then update its nextblno. 
+ */ + if (xldata->is_prev_bucket_same_wrt) + { + HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); + + writeopaque->hasho_nextblkno = xldata->nextblkno; + } + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for initializing overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) + { + Page ovflpage; + + ovflpage = BufferGetPage(ovflbuf); + + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + } + if (BufferIsValid(ovflbuf)) + UnlockReleaseBuffer(ovflbuf); + + /* replay the record for page previous to the freed overflow page */ + if (!xldata->is_prev_bucket_same_wrt && + XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + prevopaque->hasho_nextblkno = xldata->nextblkno; + + PageSetLSN(prevpage, lsn); + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(prevbuf)) + UnlockReleaseBuffer(prevbuf); + + /* replay the record for page next to the freed overflow page */ + if (XLogRecHasBlockRef(record, 4)) + { + Buffer nextbuf; + + if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + nextopaque->hasho_prevblkno = xldata->prevblkno; + + PageSetLSN(nextpage, lsn); + MarkBufferDirty(nextbuf); + } + if (BufferIsValid(nextbuf)) + UnlockReleaseBuffer(nextbuf); + } + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the primary bucket page and overflow pages. But + * during replay it's not necessary to hold those locks, since no other + * index updates can be happening concurrently. 
+ */ + /* replay the record for bitmap page */ + if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuf); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + Size datalen; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 5, &datalen); + bitmap_page_bit = (uint32 *) data; + + CLRBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuf); + } + if (BufferIsValid(mapbuf)) + UnlockReleaseBuffer(mapbuf); + + /* replay the record for meta page */ + if (XLogRecHasBlockRef(record, 6)) + { + Buffer metabuf; + + if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + char *data; + uint32 *firstfree_ovflpage; + Size datalen; + + data = XLogRecGetBlockData(record, 6, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); + } +} + +/* + * replay delete operation of hash index + */ +static void +hash_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer deletebuf; + Page page; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_primary_bucket_page) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &deletebuf); + } + + /* replay the record for deleting entries in bucket page */ + if (action == BLK_NEEDS_REDO) + { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay split cleanup flag operation for primary bucket page. 
+ */ +static void +hash_xlog_split_cleanup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + HashPageOpaque bucket_opaque; + + page = (Page) BufferGetPage(buffer); + + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay for update meta page + */ +static void +hash_xlog_update_meta_page(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record); + Buffer metabuf; + Page page; + + if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) + { + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + metap->hashm_ntuples = xldata->ntuples; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +void +hash_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + hash_xlog_init_meta_page(record); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + hash_xlog_init_bitmap_page(record); + break; + case XLOG_HASH_INSERT: + hash_xlog_insert(record); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + hash_xlog_add_ovfl_page(record); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + hash_xlog_split_allocate_page(record); + break; + case XLOG_HASH_SPLIT_PAGE: + hash_xlog_split_page(record); + break; + case XLOG_HASH_SPLIT_COMPLETE: + hash_xlog_split_complete(record); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + hash_xlog_move_page_contents(record); + break; + case XLOG_HASH_SQUEEZE_PAGE: + hash_xlog_squeeze_page(record); + break; + case XLOG_HASH_DELETE: + hash_xlog_delete(record); + break; + case XLOG_HASH_SPLIT_CLEANUP: + hash_xlog_split_cleanup(record); + break; + case XLOG_HASH_UPDATE_META_PAGE: + hash_xlog_update_meta_page(record); + break; + default: + elog(PANIC, "hash_redo: unknown op code %u", info); + } +} diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 354e7339cf4..241728fe6b1 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -16,6 +16,8 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" #include "utils/rel.h" @@ -40,6 +42,7 @@ _hash_doinsert(Relation rel, IndexTuple itup) bool do_expand; uint32 hashkey; Bucket bucket; + OffsetNumber itup_off; /* * Get the hash key for the item (it's stored in the index tuple itself). @@ -158,25 +161,20 @@ restart_insert: Assert(pageopaque->hasho_bucket == bucket); } - /* found page with enough space, so add the item here */ - (void) _hash_pgaddtup(rel, buf, itemsz, itup); - - /* - * dirty and release the modified page. if the page we modified was an - * overflow page, we also need to separately drop the pin we retained on - * the primary bucket page. - */ - MarkBufferDirty(buf); - _hash_relbuf(rel, buf); - if (buf != bucket_buf) - _hash_dropbuf(rel, bucket_buf); - /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* Do the update. 
No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* found page with enough space, so add the item here */ + itup_off = _hash_pgaddtup(rel, buf, itemsz, itup); + MarkBufferDirty(buf); + + /* metapage operations */ metap = HashPageGetMeta(metapage); metap->hashm_ntuples += 1; @@ -184,10 +182,43 @@ restart_insert: do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); - /* Write out the metapage and drop lock, but keep pin */ MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_insert xlrec; + XLogRecPtr recptr; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInsert); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock on metapage, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + /* + * Release the modified page and ensure to release the pin on primary + * page. + */ + _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); + /* Attempt to split if a split is needed */ if (do_expand) _hash_expandtable(rel, metabuf); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 1087480f7eb..a3cae21c605 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -18,6 +18,8 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" #include "utils/rel.h" @@ -136,6 +138,13 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) * page is released, then finally acquire the lock on new overflow buffer. * We need this locking order to avoid deadlock with backends that are * doing inserts. + * + * Note: We could have avoided locking many buffers here if we made two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page, if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); @@ -303,8 +312,12 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) found: /* - * Do the update. + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. 
*/ + START_CRIT_SECTION(); + if (page_found) { Assert(BufferIsValid(mapbuf)); @@ -362,6 +375,51 @@ found: MarkBufferDirty(buf); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_add_ovfl_page xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) + { + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else @@ -408,7 +466,11 @@ _hash_firstfreebit(uint32 map) * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * - * Add the tuples (itups) to wbuf. + * Add the tuples (itups) to wbuf in this function. We could do that in the + * caller as well, but the advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of overflow page has to done as an atomic operation, otherwise + * during replay on standby users might find duplicate records. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. @@ -430,8 +492,6 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, HashMetaPage metap; Buffer metabuf; Buffer mapbuf; - Buffer prevbuf = InvalidBuffer; - Buffer nextbuf = InvalidBuffer; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; @@ -445,6 +505,9 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); @@ -508,6 +571,12 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, /* Get write-lock on metapage to update firstfree */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* This operation needs to log multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); + + START_CRIT_SECTION(); + /* * we have to insert tuples on the "write" page, being careful to preserve * hashkey ordering. (If we insert many tuples into the same "write" page @@ -519,7 +588,11 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, MarkBufferDirty(wbuf); } - /* Initialize the freed overflow page. */ + /* + * Initialize the freed overflow page. 
Just zeroing the page won't work, + * because WAL replay routines expect pages to be initialized. See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. + */ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); MarkBufferDirty(ovflbuf); @@ -550,9 +623,83 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; + update_metap = true; MarkBufferDirty(metabuf); } + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) + { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. + */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) + { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + /* release previous bucket if it is not same as write bucket */ if (BufferIsValid(prevbuf) && prevblkno != writeblkno) _hash_relbuf(rel, prevbuf); @@ -601,7 +748,11 @@ _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) freep = HashPageGetBitmap(pg); MemSet(freep, 0xFF, bmsize); - /* Set pd_lower just past the end of the bitmap page data. */ + /* + * Set pd_lower just past the end of the bitmap page data. We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; } @@ -761,6 +912,15 @@ readpage: Assert(nitups == ndeletable); /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, 3 + nitups); + + START_CRIT_SECTION(); + + /* * we have to insert tuples on the "write" page, being * careful to preserve hashkey ordering. 
(If we insert * many tuples into the same "write" page it would be @@ -773,6 +933,43 @@ readpage: PageIndexMultiDelete(rpage, deletable, ndeletable); MarkBufferDirty(rbuf); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + tups_moved = true; } diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index c73929cebbb..dc606f162e1 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/smgr.h" @@ -43,6 +44,7 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask); +static void log_split_page(Relation rel, Buffer buf); /* @@ -381,6 +383,25 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_init_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.num_tuples = num_tuples; + xlrec.procid = metap->hashm_procid; + xlrec.ffactor = metap->hashm_ffactor; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + num_buckets = metap->hashm_maxbucket + 1; /* @@ -405,6 +426,12 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); + + log_newpage(&rel->rd_node, + forkNum, + blkno, + BufferGetPage(buf), + true); _hash_relbuf(rel, buf); } @@ -431,6 +458,31 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) metap->hashm_nmaps++; MarkBufferDirty(metabuf); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_init_bitmap_page xlrec; + XLogRecPtr recptr; + + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); + XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); + + /* + * This is safe only because nobody else can be modifying the index at + * this stage; it's only visible to the transaction that is creating + * it. 
+ */ + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); + + PageSetLSN(BufferGetPage(bitmapbuf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); @@ -525,7 +577,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; - /* Set pd_lower just past the end of the metadata. */ + /* + * Set pd_lower just past the end of the metadata. This is to log the + * full-page image of the metapage in xloginsert.c. + */ ((PageHeader) page)->pd_lower = ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; } @@ -569,6 +624,8 @@ _hash_expandtable(Relation rel, Buffer metabuf) uint32 maxbucket; uint32 highmask; uint32 lowmask; + bool metap_update_masks = false; + bool metap_update_splitpoint = false; restart_expand: @@ -728,7 +785,11 @@ restart_expand: * The number of buckets in the new splitpoint is equal to the total * number already in existence, i.e. new_bucket. Currently this maps * one-to-one to blocks required, but someday we may need a more - * complicated calculation here. + * complicated calculation here. We treat allocation of buckets as a + * separate WAL-logged action. Even if we fail after this operation, + * we won't leak bucket pages; rather, the next split will consume this + * space. In any case, even without failure we don't use all the space + * in one split operation. */ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { @@ -757,8 +818,7 @@ restart_expand: * Since we are scribbling on the pages in the shared buffers, establish a * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back - * to disk. We don't really expect any failure, but just to be sure, - * establish a critical section.
*/ START_CRIT_SECTION(); @@ -772,6 +832,7 @@ restart_expand: /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + metap_update_masks = true; } /* @@ -784,6 +845,7 @@ restart_expand: { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; + metap_update_splitpoint = true; } MarkBufferDirty(metabuf); @@ -829,6 +891,49 @@ restart_expand: MarkBufferDirty(buf_nblkno); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_split_allocate_page xlrec; + XLogRecPtr recptr; + + xlrec.new_bucket = maxbucket; + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); + XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + if (metap_update_masks) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; + XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); + } + + if (metap_update_splitpoint) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; + XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, + sizeof(uint32)); + XLogRegisterBufData(2, + (char *) &metap->hashm_spares[metap->hashm_ovflpoint], + sizeof(uint32)); + } + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); + + PageSetLSN(BufferGetPage(buf_oblkno), recptr); + PageSetLSN(BufferGetPage(buf_nblkno), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + END_CRIT_SECTION(); /* drop lock, but keep pin */ @@ -883,6 +988,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; char zerobuf[BLCKSZ]; + Page page; lastblock = firstblock + nblocks - 1; @@ -893,7 +999,20 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; - MemSet(zerobuf, 0, sizeof(zerobuf)); + page = (Page) zerobuf; + + /* + * Initialize the last page in the newly allocated space. Just zeroing + * the page won't work; see _hash_freeovflpage for similar usage. + */ + _hash_pageinit(page, BLCKSZ); + + if (RelationNeedsWAL(rel)) + log_newpage(&rel->rd_node, + MAIN_FORKNUM, + lastblock, + zerobuf, + true); RelationOpenSmgr(rel); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); @@ -951,6 +1070,11 @@ _hash_splitbucket(Relation rel, Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size all_tups_size = 0; + int i; + uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); @@ -1029,29 +1153,38 @@ _hash_splitbucket(Relation rel, itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); - if (PageGetFreeSpace(npage) < itemsz) + if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { - /* write out nbuf and drop lock, but keep pin */ + /* + * Change the shared buffer state in a critical section; + * otherwise, any error could make it unrecoverable.
+ */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } - /* - * Insert tuple on new page, using _hash_pgaddtup to ensure - * correct ordering by hashkey. This is a tad inefficient - * since we may have to shuffle itempointers repeatedly. - * Possible future improvement: accumulate all the items for - * the new page and qsort them before insertion. - */ - (void) _hash_pgaddtup(rel, nbuf, itemsz, new_itup); - - /* be tidy */ - pfree(new_itup); + itups[nitups++] = new_itup; + all_tups_size += itemsz; } else { @@ -1073,11 +1206,27 @@ _hash_splitbucket(Relation rel, /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { + /* + * Change the shared buffer state in a critical section; + * otherwise, any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); break; } @@ -1103,6 +1252,8 @@ _hash_splitbucket(Relation rel, npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + START_CRIT_SECTION(); + oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; @@ -1119,6 +1270,29 @@ _hash_splitbucket(Relation rel, */ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_split_complete xlrec; + + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); + + XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); + + PageSetLSN(BufferGetPage(bucket_obuf), recptr); + PageSetLSN(BufferGetPage(bucket_nbuf), recptr); + } + + END_CRIT_SECTION(); } /* @@ -1245,6 +1419,32 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, } /* + * log_split_page() -- Log the split operation + * + * We log the split operation when the new page in the new bucket gets full, + * at which point we log the entire page. + * + * 'buf' must be locked by the caller, which is also responsible for + * unlocking it. + */ +static void +log_split_page(Relation rel, Buffer buf) +{ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + } +} + +/* * _hash_getcachedmetap() -- Returns cached metapage data.
* * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 9e5d7e4babe..d7337703b0b 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -123,6 +123,7 @@ _hash_readnext(IndexScanDesc scan, if (block_found) { *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } } @@ -168,6 +169,7 @@ _hash_readprev(IndexScanDesc scan, *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); /* @@ -283,6 +285,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (HashPageOpaque) PageGetSpecialPointer(page); bucket = opaque->hasho_bucket; @@ -318,6 +321,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) LockBuffer(buf, BUFFER_LOCK_UNLOCK); old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf)); /* * remember the split bucket buffer so as to use it later for @@ -520,6 +524,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) _hash_readprev(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index 7eac8191cad..f1cc9ff9514 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -19,10 +19,142 @@ void hash_desc(StringInfo buf, XLogReaderState *record) { + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + { + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; + + appendStringInfo(buf, "num_tuples %g, fillfactor %d", + xlrec->num_tuples, xlrec->ffactor); + break; + } + case XLOG_HASH_INIT_BITMAP_PAGE: + { + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec; + + appendStringInfo(buf, "bmsize %d", xlrec->bmsize); + break; + } + case XLOG_HASH_INSERT: + { + xl_hash_insert *xlrec = (xl_hash_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_HASH_ADD_OVFL_PAGE: + { + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec; + + appendStringInfo(buf, "bmsize %d, bmpage_found %c", + xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + { + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec; + + appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + xlrec->new_bucket, + (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', + (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 
'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_COMPLETE: + { + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec; + + appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u", + xlrec->old_bucket_flag, xlrec->new_bucket_flag); + break; + } + case XLOG_HASH_MOVE_PAGE_CONTENTS: + { + xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec; + + appendStringInfo(buf, "ntups %d, is_primary %c", + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_SQUEEZE_PAGE: + { + xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; + + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + xlrec->prevblkno, + xlrec->nextblkno, + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_DELETE: + { + xl_hash_delete *xlrec = (xl_hash_delete *) rec; + + appendStringInfo(buf, "is_primary %c", + xlrec->is_primary_bucket_page ? 'T' : 'F'); + break; + } + case XLOG_HASH_UPDATE_META_PAGE: + { + xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec; + + appendStringInfo(buf, "ntuples %g", + xlrec->ntuples); + break; + } + } } const char * hash_identify(uint8 info) { - return NULL; + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HASH_INIT_META_PAGE: + id = "INIT_META_PAGE"; + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + id = "INIT_BITMAP_PAGE"; + break; + case XLOG_HASH_INSERT: + id = "INSERT"; + break; + case XLOG_HASH_ADD_OVFL_PAGE: + id = "ADD_OVFL_PAGE"; + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + id = "SPLIT_ALLOCATE_PAGE"; + break; + case XLOG_HASH_SPLIT_PAGE: + id = "SPLIT_PAGE"; + break; + case XLOG_HASH_SPLIT_COMPLETE: + id = "SPLIT_COMPLETE"; + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + id = "MOVE_PAGE_CONTENTS"; + break; + case XLOG_HASH_SQUEEZE_PAGE: + id = "SQUEEZE_PAGE"; + break; + case XLOG_HASH_DELETE: + id = "DELETE"; + break; + case XLOG_HASH_SPLIT_CLEANUP: + id = "SPLIT_CLEANUP"; + break; + case XLOG_HASH_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + } + + return id; } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 72bb06c7602..9618032356a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -506,11 +506,6 @@ DefineIndex(Oid relationId, accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); - if (strcmp(accessMethodName, "hash") == 0 && - RelationNeedsWAL(rel)) - ereport(WARNING, - (errmsg("hash indexes are not WAL-logged and their use is discouraged"))); - if (stmt->unique && !amRoutine->amcanunique) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 9001e202b03..ce55fc52777 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -5880,13 +5880,10 @@ RelationIdIsInInitFile(Oid relationId) /* * Tells whether any index for the relation is unlogged. * - * Any index using the hash AM is implicitly unlogged. - * * Note: There doesn't seem to be any way to have an unlogged index attached - * to a permanent table except to create a hash index, but it seems best to - * keep this general so that it returns sensible results even when they seem - * obvious (like for an unlogged table) and to handle possible future unlogged - * indexes on permanent tables. 
+ to a permanent table, but it seems best to keep this general so that it + returns sensible results even when they seem obvious (like for an unlogged + table) and to handle possible future unlogged indexes on permanent tables. */ bool RelationHasUnloggedIndex(Relation rel) @@ -5908,8 +5905,7 @@ RelationHasUnloggedIndex(Relation rel) elog(ERROR, "cache lookup failed for relation %u", indexoid); reltup = (Form_pg_class) GETSTRUCT(tp); - if (reltup->relpersistence == RELPERSISTENCE_UNLOGGED - || reltup->relam == HASH_AM_OID) + if (reltup->relpersistence == RELPERSISTENCE_UNLOGGED) result = true; ReleaseSysCache(tp); diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index cc231632e12..2075ab7afad 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -16,7 +16,239 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +#include "storage/off.h" /* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */ +#define HASH_XLOG_FREE_OVFL_BUFS 6 + +/* + * XLOG records for hash operations + */ +#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */ +#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */ +#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */ +#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */ +#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */ +#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */ +#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split + * operation */ +#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page + * and add to another page */ +#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous + * pages in chain and free the ovfl + * page */ +#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */ +#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary + * bucket page after deleting tuples + * that are moved due to split */ +#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after + * vacuum */ + + +/* + * xl_hash_split_allocate_page flag values; 8 bits are available. + */ +#define XLH_SPLIT_META_UPDATE_MASKS (1<<0) +#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1) + +/* + * This is what we need to know about a HASH index creation. + * + * Backup block 0: metapage + */ +typedef struct xl_hash_createidx +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_createidx; +#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16)) + +/* + * This is what we need to know about a simple (without split) insert. + * + * This data record is used for XLOG_HASH_INSERT + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: metapage (HashMetaPageData) + */ +typedef struct xl_hash_insert +{ + OffsetNumber offnum; +} xl_hash_insert; + +#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about the addition of an overflow page.
+ * + * This data record is used for XLOG_HASH_ADD_OVFL_PAGE + * + * Backup Blk 0: newly allocated overflow page + * Backup Blk 1: page before new overflow page in the bucket chain + * Backup Blk 2: bitmap page + * Backup Blk 3: new bitmap page + * Backup Blk 4: metapage + */ +typedef struct xl_hash_add_ovfl_page +{ + uint16 bmsize; + bool bmpage_found; +} xl_hash_add_ovfl_page; + +#define SizeOfHashAddOvflPage \ + (offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool)) + +/* + * This is what we need to know about allocating a page for a split. + * + * This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + * Backup Blk 2: metapage + */ +typedef struct xl_hash_split_allocate_page +{ + uint32 new_bucket; + uint16 old_bucket_flag; + uint16 new_bucket_flag; + uint8 flags; +} xl_hash_split_allocate_page; + +#define SizeOfHashSplitAllocPage \ + (offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8)) + +/* + * This is what we need to know about completing the split operation. + * + * This data record is used for XLOG_HASH_SPLIT_COMPLETE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + */ +typedef struct xl_hash_split_complete +{ + uint16 old_bucket_flag; + uint16 new_bucket_flag; +} xl_hash_split_complete; + +#define SizeOfHashSplitComplete \ + (offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16)) + +/* + * This is what we need to know about moving page contents during a + * squeeze operation. + * + * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS + * + * Backup Blk 0: bucket page + * Backup Blk 1: page containing moved tuples + * Backup Blk 2: page from which tuples will be removed + */ +typedef struct xl_hash_move_page_contents +{ + uint16 ntups; + bool is_prim_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is the same + * as the primary bucket page */ +} xl_hash_move_page_contents; + +#define SizeOfHashMovePageContents \ + (offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the squeeze page operation. + * + * This data record is used for XLOG_HASH_SQUEEZE_PAGE + * + * Backup Blk 0: page containing tuples moved from freed overflow page + * Backup Blk 1: freed overflow page + * Backup Blk 2: page previous to the freed overflow page + * Backup Blk 3: page next to the freed overflow page + * Backup Blk 4: bitmap page containing info of freed overflow page + * Backup Blk 5: meta page + */ +typedef struct xl_hash_squeeze_page +{ + BlockNumber prevblkno; + BlockNumber nextblkno; + uint16 ntups; + bool is_prim_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is the same + * as the primary bucket page */ + bool is_prev_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is the + * page previous to the freed + * overflow page */ +} xl_hash_squeeze_page; + +#define SizeOfHashSqueezePage \ + (offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the deletion of index tuples from a page.
+ * + * This data record is used for XLOG_HASH_DELETE + * + * Backup Blk 0: primary bucket page + * Backup Blk 1: page from which tuples are deleted + */ +typedef struct xl_hash_delete +{ + bool is_primary_bucket_page; /* TRUE if the operation is for + * the primary bucket page */ +} xl_hash_delete; + +#define SizeOfHashDelete (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool)) + +/* + * This is what we need for a metapage update operation. + * + * This data record is used for XLOG_HASH_UPDATE_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_update_meta_page +{ + double ntuples; +} xl_hash_update_meta_page; + +#define SizeOfHashUpdateMetaPage \ + (offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double)) + +/* + * This is what we need to initialize the metapage. + * + * This data record is used for XLOG_HASH_INIT_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_init_meta_page +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_init_meta_page; + +#define SizeOfHashInitMetaPage \ + (offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16)) + +/* + * This is what we need to initialize a bitmap page. + * + * This data record is used for XLOG_HASH_INIT_BITMAP_PAGE + * + * Backup Blk 0: bitmap page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_init_bitmap_page +{ + uint16 bmsize; +} xl_hash_init_bitmap_page; + +#define SizeOfHashInitBitmapPage \ + (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16)) extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index e519fdb0f69..26cd05933ca 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2335,13 +2335,9 @@ Options: fastupdate=on, gin_pending_list_limit=128 -- HASH -- CREATE INDEX hash_i4_index ON hash_i4_heap USING hash (random int4_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_name_index ON hash_name_heap USING hash (random name_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_txt_index ON hash_txt_heap USING hash (random text_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_f8_index ON hash_f8_heap USING hash (random float8_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE UNLOGGED TABLE unlogged_hash_table (id int4); CREATE INDEX unlogged_hash_index ON unlogged_hash_table USING hash (id int4_ops); DROP TABLE unlogged_hash_table; @@ -2350,7 +2346,6 @@ DROP TABLE unlogged_hash_table; -- maintenance_work_mem setting and fillfactor: SET maintenance_work_mem = '1MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); -WARNING: hash indexes are not WAL-logged and their use is discouraged EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; QUERY PLAN diff --git a/src/test/regress/expected/enum.out b/src/test/regress/expected/enum.out index 514d1d01a10..0e6030443f9 100644 --- a/src/test/regress/expected/enum.out +++ b/src/test/regress/expected/enum.out @@ -383,7 +383,6 @@ DROP INDEX enumtest_btree; -- Hash index / opclass with the = operator -- CREATE INDEX enumtest_hash ON enumtest USING hash (col); -WARNING: hash indexes are not WAL-logged and their use is discouraged SELECT * FROM enumtest WHERE col = 
'orange'; col -------- diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out index f8b9f029b21..0a18efacfc4 100644 --- a/src/test/regress/expected/hash_index.out +++ b/src/test/regress/expected/hash_index.out @@ -201,7 +201,6 @@ SELECT h.seqno AS f20000 -- CREATE TABLE hash_split_heap (keycol INT); CREATE INDEX hash_split_index on hash_split_heap USING HASH (keycol); -WARNING: hash indexes are not WAL-logged and their use is discouraged INSERT INTO hash_split_heap SELECT 1 FROM generate_series(1, 70000) a; VACUUM FULL hash_split_heap; -- Let's do a backward scan. @@ -230,5 +229,4 @@ DROP TABLE hash_temp_heap CASCADE; CREATE TABLE hash_heap_float4 (x float4, y int); INSERT INTO hash_heap_float4 VALUES (1.1,1); CREATE INDEX hash_idx ON hash_heap_float4 USING hash (x); -WARNING: hash indexes are not WAL-logged and their use is discouraged DROP TABLE hash_heap_float4 CASCADE; diff --git a/src/test/regress/expected/macaddr.out b/src/test/regress/expected/macaddr.out index e84ff5f8c0a..151f9ce59bb 100644 --- a/src/test/regress/expected/macaddr.out +++ b/src/test/regress/expected/macaddr.out @@ -41,7 +41,6 @@ SELECT * FROM macaddr_data; CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b); CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b); -WARNING: hash indexes are not WAL-logged and their use is discouraged SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1; a | b | trunc ----+-------------------+------------------- diff --git a/src/test/regress/expected/replica_identity.out b/src/test/regress/expected/replica_identity.out index fa63235fc9d..67c34a92a4e 100644 --- a/src/test/regress/expected/replica_identity.out +++ b/src/test/regress/expected/replica_identity.out @@ -12,7 +12,6 @@ CREATE UNIQUE INDEX test_replica_identity_keyab_key ON test_replica_identity (ke CREATE UNIQUE INDEX test_replica_identity_oid_idx ON test_replica_identity (oid); CREATE UNIQUE INDEX test_replica_identity_nonkey ON test_replica_identity (keya, nonkey); CREATE INDEX test_replica_identity_hash ON test_replica_identity USING hash (nonkey); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE UNIQUE INDEX test_replica_identity_expr ON test_replica_identity (keya, keyb, (3)); CREATE UNIQUE INDEX test_replica_identity_partial ON test_replica_identity (keya, keyb) WHERE keyb != '3'; -- default is 'd'/DEFAULT for user created tables diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index 423f27787f3..db66dc723ef 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -114,7 +114,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222 -- btree and hash index creation test CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -WARNING: hash indexes are not WAL-logged and their use is discouraged -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); -- should fail
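
The write-side hunks above all follow a single WAL-logging pattern: enter a critical section, modify and dirty the buffers, then, when RelationNeedsWAL, build a record with XLogBeginInsert/XLogRegisterBuffer/XLogRegisterData, call XLogInsert, and stamp each modified registered page with the returned LSN. The condensed C sketch below illustrates that pattern for the simple-insert case. It is an illustration only, not the hashinsert.c hunk (which is not reproduced on this page); the locals rel, buf, metabuf, itup, itup_off, and itemsz are assumed to be set up as in _hash_doinsert.

    /*
     * Sketch of the recurring WAL-logging pattern (illustrative only;
     * variable setup is assumed, not taken from this diff).
     */
    START_CRIT_SECTION();

    /* modify the page(s) while holding exclusive content locks */
    (void) _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);
    MarkBufferDirty(metabuf);   /* the tuple count on the metapage was bumped */

    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;    /* offset at which the tuple was added */

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        /* register every buffer the redo routine will need to touch */
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        /* stamp every modified page with the record's LSN */
        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();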
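
hash_xlog.c, which consumes these records (963 lines in the diffstat), is likewise not reproduced on this page. Purely to show the shape of the replay side, here is a hedged sketch of redoing XLOG_HASH_INSERT with the standard XLogReadBufferForRedo machinery. This is not the commit's actual code: the function name is made up, and the sketch omits the metapage (registered as block 1), which the real record must also replay.

    /*
     * Illustrative sketch only -- NOT the hash_xlog.c from this commit.
     * Shows the usual shape of a redo routine for XLOG_HASH_INSERT; the
     * metapage (block 1) would be handled with a similar block.
     */
    static void
    hash_xlog_insert_sketch(XLogReaderState *record)
    {
        XLogRecPtr  lsn = record->EndRecPtr;
        xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record);
        Buffer      buffer;

        /* block 0 is the bucket or overflow page that received the tuple */
        if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
        {
            Size        datalen;
            char       *datapos = XLogRecGetBlockData(record, 0, &datalen);
            Page        page = BufferGetPage(buffer);

            /* re-insert the logged tuple at the recorded offset */
            if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
                            false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add item to hash index page");

            PageSetLSN(page, lsn);
            MarkBufferDirty(buffer);
        }
        if (BufferIsValid(buffer))
            UnlockReleaseBuffer(buffer);
    }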
