Fix bugs in Serializable Snapshot Isolation.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index ba01874469bf7af9b18bd288e9edea5aa0be3115..7a0e1a9c25ea2423f1322a174b7cb6e7dd1bbb69 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -824,7 +824,6 @@ restart:
         if (_bt_page_recyclable(page))
         {
                 /* Okay to recycle this page */
-               Assert(!PageIsPredicateLocked(rel, blkno));
                 RecordFreeIndexPage(rel, blkno);
                 vstate->totFreePages++;
                 stats->pages_deleted++;
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c

index d660ce5af78aefaa4dcb5576db722269603d5133..aa657fab579ea8b4c2090333a0da6a1789404d8e 100644 (file)
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -124,10 +124,6 @@
   *     SerializableXactHashLock
   *             - Protects both PredXact and SerializableXidHash.
   *
- *     PredicateLockNextRowLinkLock
- *             - Protects the priorVersionOfRow and nextVersionOfRow fields of
- *                     PREDICATELOCKTARGET when linkage is being created or destroyed.
- *
   *
   * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -444,8 +440,6 @@ static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
                                                    bool summarize);
  static bool XidIsConcurrent(TransactionId xid);
  static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
-static bool CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
-                                                 PREDICATELOCKTARGETTAG *nexttargettag);
  static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
  static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
                                                                                 SERIALIZABLEXACT *writer);
@@ -1044,7 +1038,6 @@ InitPredicateLocks(void)
                 PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
                 PredXact->CanPartialClearThrough = 0;
                 PredXact->HavePartialClearedThrough = 0;
-               PredXact->NeedTargetLinkCleanup = false;
                 requestSize = mul_size((Size) max_table_size,
                                                            PredXactListElementDataSize);
                 PredXact->element = ShmemAlloc(requestSize);
@@ -1651,9 +1644,10 @@ PageIsPredicateLocked(const Relation relation, const BlockNumber blkno)
   * Important note: this function may return false even if the lock is
   * being held, because it uses the local lock table which is not
   * updated if another transaction modifies our lock list (e.g. to
- * split an index page). However, it will never return true if the
- * lock is not held. We only use this function in circumstances where
- * such false negatives are acceptable.
+ * split an index page). It can also return true when a coarser
+ * granularity lock that covers this target is being held. Be careful
+ * to only use this function in circumstances where such errors are
+ * acceptable!
   */
  static bool
  PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
@@ -1717,6 +1711,9 @@ GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
  /*
   * Check whether the lock we are considering is already covered by a
   * coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
   */
  static bool
  CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
@@ -1747,7 +1744,6 @@ static void
  RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
  {
         PREDICATELOCKTARGET *rmtarget;
-       PREDICATELOCKTARGET *next;
  
         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
  
@@ -1755,33 +1751,6 @@ RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
         if (!SHMQueueEmpty(&target->predicateLocks))
                 return;
  
-       /* Can't remove it if there are locks for a prior row version. */
-       LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
-       if (target->priorVersionOfRow != NULL)
-       {
-               LWLockRelease(PredicateLockNextRowLinkLock);
-               return;
-       }
-
-       /*
-        * We are going to release this target,  This requires that we let the
-        * next version of the row (if any) know that it's previous version is
-        * done.
-        *
-        * It might be that the link was all that was keeping the other target
-        * from cleanup, but we can't clean that up here -- LW locking is all
-        * wrong for that.      We'll pass the HTAB in the general cleanup function to
-        * get rid of such "dead" targets.
-        */
-       next = target->nextVersionOfRow;
-       if (next != NULL)
-       {
-               next->priorVersionOfRow = NULL;
-               if (SHMQueueEmpty(&next->predicateLocks))
-                       PredXact->NeedTargetLinkCleanup = true;
-       }
-       LWLockRelease(PredicateLockNextRowLinkLock);
-
         /* Actually remove the target. */
         rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
                                                                                    &target->tag,
@@ -2065,11 +2034,7 @@ CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
                                  errmsg("out of shared memory"),
                                  errhint("You might need to increase max_pred_locks_per_transaction.")));
         if (!found)
-       {
                 SHMQueueInit(&(target->predicateLocks));
-               target->priorVersionOfRow = NULL;
-               target->nextVersionOfRow = NULL;
-       }
  
         /* We've got the sxact and target, make sure they're joined. */
         locktag.myTarget = target;
@@ -2125,8 +2090,6 @@ PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
                 hash_search_with_hash_value(LocalPredicateLockHash,
                                                                         targettag, targettaghash,
                                                                         HASH_ENTER, &found);
-       /* We should not hold the lock (but its entry might still exist) */
-       Assert(!found || !locallock->held);
         locallock->held = true;
         if (!found)
                 locallock->childLocks = 0;
@@ -2215,6 +2178,7 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
  {
         PREDICATELOCKTARGETTAG tag;
         ItemPointer tid;
+       TransactionId targetxmin;
  
         if (SkipSerialization(relation))
                 return;
@@ -2224,15 +2188,16 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
          */
         if (relation->rd_index == NULL)
         {
-               TransactionId myxid = GetTopTransactionIdIfAny();
+               TransactionId   myxid;
+
+               targetxmin = HeapTupleHeaderGetXmin(tuple->t_data);
  
+               myxid = GetTopTransactionIdIfAny();
                 if (TransactionIdIsValid(myxid))
                 {
-                       TransactionId xid = HeapTupleHeaderGetXmin(tuple->t_data);
-
-                       if (TransactionIdFollowsOrEquals(xid, TransactionXmin))
+                       if (TransactionIdFollowsOrEquals(targetxmin, TransactionXmin))
                         {
-                               xid = SubTransGetTopmostTransaction(xid);
+                               TransactionId xid = SubTransGetTopmostTransaction(targetxmin);
                                 if (TransactionIdEquals(xid, myxid))
                                 {
                                         /* We wrote it; we already have a write lock. */
@@ -2241,6 +2206,8 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
                         }
                 }
         }
+       else
+               targetxmin = InvalidTransactionId;
  
         /*
          * Do quick-but-not-definitive test for a relation lock first.  This will
@@ -2259,116 +2226,78 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
                                                                          relation->rd_node.dbNode,
                                                                          relation->rd_id,
                                                                          ItemPointerGetBlockNumber(tid),
-                                                                        ItemPointerGetOffsetNumber(tid));
+                                                                        ItemPointerGetOffsetNumber(tid),
+                                                                        targetxmin);
         PredicateLockAcquire(&tag);
  }
  
  /*
- * If the old tuple has any predicate locks, create a lock target for the
- * new tuple and point them at each other.     Conflict detection needs to
- * look for locks against prior versions of the row.
+ * If the old tuple has any predicate locks, copy them to the new target.
+ *
+ * This is called at an UPDATE, where any predicate locks held on the old
+ * tuple need to be copied to the new tuple, because logically they both
+ * represent the same row. A lock taken before the update must conflict
+ * with anyone locking the same row after the update.
   */
  void
  PredicateLockTupleRowVersionLink(const Relation relation,
                                                                  const HeapTuple oldTuple,
                                                                  const HeapTuple newTuple)
  {
-       PREDICATELOCKTARGETTAG oldtargettag;
-       PREDICATELOCKTARGETTAG newtargettag;
-       PREDICATELOCKTARGET *oldtarget;
-       PREDICATELOCKTARGET *newtarget;
-       PREDICATELOCKTARGET *next;
-       uint32          oldtargettaghash;
-       LWLockId        oldpartitionLock;
-       uint32          newtargettaghash;
-       LWLockId        newpartitionLock;
-       bool            found;
-
-       SET_PREDICATELOCKTARGETTAG_TUPLE(oldtargettag,
+       PREDICATELOCKTARGETTAG oldtupletag;
+       PREDICATELOCKTARGETTAG oldpagetag;
+       PREDICATELOCKTARGETTAG newtupletag;
+       BlockNumber     oldblk,
+                               newblk;
+       OffsetNumber oldoff,
+                               newoff;
+       TransactionId oldxmin,
+                               newxmin;
+
+       oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
+       oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
+       oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
+
+       newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
+       newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
+       newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
+
+       SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
                                                                          relation->rd_node.dbNode,
                                                                          relation->rd_id,
-                                                         ItemPointerGetBlockNumber(&(oldTuple->t_self)),
-                                                       ItemPointerGetOffsetNumber(&(oldTuple->t_self)));
-       oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
-       oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+                                                                        oldblk,
+                                                                        oldoff,
+                                                                        oldxmin);
  
-       SET_PREDICATELOCKTARGETTAG_TUPLE(newtargettag,
+       SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
+                                                                       relation->rd_node.dbNode,
+                                                                       relation->rd_id,
+                                                                       oldblk);
+
+       SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
                                                                          relation->rd_node.dbNode,
                                                                          relation->rd_id,
-                                                         ItemPointerGetBlockNumber(&(newTuple->t_self)),
-                                                       ItemPointerGetOffsetNumber(&(newTuple->t_self)));
-       newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
-       newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
-
-       /* Lock lower numbered partition first. */
-       if (oldpartitionLock < newpartitionLock)
-       {
-               LWLockAcquire(oldpartitionLock, LW_SHARED);
-               LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-       }
-       else if (newpartitionLock < oldpartitionLock)
-       {
-               LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-               LWLockAcquire(oldpartitionLock, LW_SHARED);
-       }
-       else
-               LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-
-       oldtarget = (PREDICATELOCKTARGET *)
-               hash_search_with_hash_value(PredicateLockTargetHash,
-                                                                       &oldtargettag, oldtargettaghash,
-                                                                       HASH_FIND, NULL);
-
-       /* Only need to link if there is an old target already. */
-       if (oldtarget)
-       {
-               LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
-
-               /* Guard against stale pointers from rollback. */
-               next = oldtarget->nextVersionOfRow;
-               if (next != NULL)
-               {
-                       next->priorVersionOfRow = NULL;
-                       oldtarget->nextVersionOfRow = NULL;
-               }
-
-               /* Find or create the new target, and link old and new. */
-               newtarget = (PREDICATELOCKTARGET *)
-                       hash_search_with_hash_value(PredicateLockTargetHash,
-                                                                               &newtargettag, newtargettaghash,
-                                                                               HASH_ENTER, &found);
-               if (!newtarget)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of shared memory"),
-                                        errhint("You might need to increase max_pred_locks_per_transaction.")));
-               if (!found)
-               {
-                       SHMQueueInit(&(newtarget->predicateLocks));
-                       newtarget->nextVersionOfRow = NULL;
-               }
-               else
-                       Assert(newtarget->priorVersionOfRow == NULL);
+                                                                        newblk,
+                                                                        newoff,
+                                                                        newxmin);
  
-               newtarget->priorVersionOfRow = oldtarget;
-               oldtarget->nextVersionOfRow = newtarget;
+       /*
+        * A page-level lock on the page containing the old tuple counts too.
+        * Anyone holding a lock on the page is logically holding a lock on
+        * the old tuple, so we need to acquire a lock on his behalf on the
+        * new tuple too. However, if the new tuple is on the same page as the
+        * old one, the old page-level lock already covers the new tuple.
+        *
+        * A relation-level lock always covers both tuple versions, so we don't
+        * need to worry about those here.
+        */
+       LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
  
-               LWLockRelease(PredicateLockNextRowLinkLock);
-       }
+       TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
+       if (newblk != oldblk)
+               TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
  
-       /* Release lower number partition last. */
-       if (oldpartitionLock < newpartitionLock)
-       {
-               LWLockRelease(newpartitionLock);
-               LWLockRelease(oldpartitionLock);
-       }
-       else if (newpartitionLock < oldpartitionLock)
-       {
-               LWLockRelease(oldpartitionLock);
-               LWLockRelease(newpartitionLock);
-       }
-       else
-               LWLockRelease(newpartitionLock);
+       LWLockRelease(SerializablePredicateLockListLock);
  }
  
  
@@ -2437,6 +2366,17 @@ DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
   * removeOld is set (by using the reserved entry in
   * PredicateLockTargetHash for scratch space).
   *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
   * Caller must hold SerializablePredicateLockListLock.
   */
  static bool
@@ -2533,11 +2473,7 @@ TransferPredicateLocksToNewTarget(const PREDICATELOCKTARGETTAG oldtargettag,
  
                 /* If we created a new entry, initialize it */
                 if (!found)
-               {
                         SHMQueueInit(&(newtarget->predicateLocks));
-                       newtarget->priorVersionOfRow = NULL;
-                       newtarget->nextVersionOfRow = NULL;
-               }
  
                 newpredlocktag.myTarget = newtarget;
  
@@ -2704,7 +2640,14 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
                                                                                         &newtargettag);
                 Assert(success);
  
-               /* Move the locks to the parent. This shouldn't fail.  */
+               /*
+                * Move the locks to the parent. This shouldn't fail.
+                *
+                * Note that here we are removing locks held by other
+                * backends, leading to a possible inconsistency in their
+                * local lock hash table. This is OK because we're replacing
+                * it with a lock that covers the old one.
+                */
                 success = TransferPredicateLocksToNewTarget(oldtargettag,
                                                                                                         newtargettag,
                                                                                                         true);
@@ -2727,36 +2670,19 @@ void
  PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno,
                                                  const BlockNumber newblkno)
  {
-       PREDICATELOCKTARGETTAG oldtargettag;
-       PREDICATELOCKTARGETTAG newtargettag;
-       bool            success;
-
-
-       if (SkipSplitTracking(relation))
-               return;
-
-       Assert(oldblkno != newblkno);
-       Assert(BlockNumberIsValid(oldblkno));
-       Assert(BlockNumberIsValid(newblkno));
-
-       SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
-                                                                       relation->rd_node.dbNode,
-                                                                       relation->rd_id,
-                                                                       oldblkno);
-       SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
-                                                                       relation->rd_node.dbNode,
-                                                                       relation->rd_id,
-                                                                       newblkno);
-
-       LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
-       /* Move the locks. This shouldn't fail. */
-       success = TransferPredicateLocksToNewTarget(oldtargettag,
-                                                                                               newtargettag,
-                                                                                               true);
-       Assert(success);
-
-       LWLockRelease(SerializablePredicateLockListLock);
+       /*
+        * Page combines differ from page splits in that we ought to be
+        * able to remove the locks on the old page after transferring
+        * them to the new page, instead of duplicating them. However,
+        * because we can't edit other backends' local lock tables,
+        * removing the old lock would leave them with an entry in their
+        * LocalPredicateLockHash for a lock they're not holding, which
+        * isn't acceptable. So we wind up having to do the same work as a
+        * page split, acquiring a lock on the new page and keeping the old
+        * page locked too. That can lead to some false positives, but
+        * should be rare in practice.
+        */
+       PredicateLockPageSplit(relation, oldblkno, newblkno);
  }
  
  /*
@@ -3132,9 +3058,6 @@ ClearOldPredicateLocks(void)
  {
         SERIALIZABLEXACT *finishedSxact;
         PREDICATELOCK *predlock;
-       int                     i;
-       HASH_SEQ_STATUS seqstat;
-       PREDICATELOCKTARGET *locktarget;
  
         LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
         finishedSxact = (SERIALIZABLEXACT *)
@@ -3232,35 +3155,6 @@ ClearOldPredicateLocks(void)
  
         LWLockRelease(SerializablePredicateLockListLock);
         LWLockRelease(SerializableFinishedListLock);
-
-       if (!PredXact->NeedTargetLinkCleanup)
-               return;
-
-       /*
-        * Clean up any targets which were disconnected from a prior version with
-        * no predicate locks attached.
-        */
-       for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
-               LWLockAcquire(FirstPredicateLockMgrLock + i, LW_EXCLUSIVE);
-       LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
-
-       hash_seq_init(&seqstat, PredicateLockTargetHash);
-       while ((locktarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
-       {
-               if (SHMQueueEmpty(&locktarget->predicateLocks)
-                       && locktarget->priorVersionOfRow == NULL
-                       && locktarget->nextVersionOfRow == NULL)
-               {
-                       hash_search(PredicateLockTargetHash, &locktarget->tag,
-                                               HASH_REMOVE, NULL);
-               }
-       }
-
-       PredXact->NeedTargetLinkCleanup = false;
-
-       LWLockRelease(PredicateLockNextRowLinkLock);
-       for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
-               LWLockRelease(FirstPredicateLockMgrLock + i);
  }
  
  /*
@@ -3676,38 +3570,15 @@ CheckForSerializableConflictOut(const bool visible, const Relation relation,
  }
  
  /*
- * Check a particular target for rw-dependency conflict in. This will
- * also check prior versions of a tuple, if any.
+ * Check a particular target for rw-dependency conflict in.
   */
  static void
  CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
-{
-       PREDICATELOCKTARGETTAG nexttargettag = { 0 };
-       PREDICATELOCKTARGETTAG thistargettag;
-
-       for (;;)
-       {
-               if (!CheckSingleTargetForConflictsIn(targettag, &nexttargettag))
-                       break;
-               thistargettag = nexttargettag;
-               targettag = &thistargettag;
-       }
-}
-
-/*
- * Check a particular target for rw-dependency conflict in. If the tuple
- * has prior versions, returns true and *nexttargettag is set to the tag
- * of the prior tuple version.
- */
-static bool
-CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
-                                                               PREDICATELOCKTARGETTAG *nexttargettag)
  {
         uint32          targettaghash;
         LWLockId        partitionLock;
         PREDICATELOCKTARGET *target;
         PREDICATELOCK *predlock;
-       bool            hasnexttarget = false;
  
         Assert(MySerializableXact != InvalidSerializableXact);
  
@@ -3717,7 +3588,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
         targettaghash = PredicateLockTargetTagHashCode(targettag);
         partitionLock = PredicateLockHashPartitionLock(targettaghash);
         LWLockAcquire(partitionLock, LW_SHARED);
-       LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
         target = (PREDICATELOCKTARGET *)
                 hash_search_with_hash_value(PredicateLockTargetHash,
                                                                         targettag, targettaghash,
@@ -3725,21 +3595,9 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
         if (!target)
         {
                 /* Nothing has this target locked; we're done here. */
-               LWLockRelease(PredicateLockNextRowLinkLock);
                 LWLockRelease(partitionLock);
-               return false;
-       }
-
-       /*
-        * If the target is linked to a prior version of the row, save the tag so
-        * that it can be used for iterative calls to this function.
-        */
-       if (target->priorVersionOfRow != NULL)
-       {
-               *nexttargettag = target->priorVersionOfRow->tag;
-               hasnexttarget = true;
+               return;
         }
-       LWLockRelease(PredicateLockNextRowLinkLock);
  
         /*
          * Each lock for an overlapping transaction represents a conflict: a
@@ -3828,17 +3686,25 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
                                                 hash_search_with_hash_value(LocalPredicateLockHash,
                                                                                                         targettag, targettaghash,
                                                                                                         HASH_FIND, NULL);
-                                       Assert(locallock != NULL);
-                                       Assert(locallock->held);
-                                       locallock->held = false;
  
-                                       if (locallock->childLocks == 0)
+                                       /*
+                                        * Remove entry in local lock table if it exists and has
+                                        * no children. It's OK if it doesn't exist; that means
+                                        * the lock was transferred to a new target by a
+                                        * different backend.
+                                        */
+                                       if (locallock != NULL)
                                         {
-                                               rmlocallock = (LOCALPREDICATELOCK *)
-                                                       hash_search_with_hash_value(LocalPredicateLockHash,
-                                                                                                       targettag, targettaghash,
-                                                                                                               HASH_REMOVE, NULL);
-                                               Assert(rmlocallock == locallock);
+                                               locallock->held = false;
+
+                                               if (locallock->childLocks == 0)
+                                               {
+                                                       rmlocallock = (LOCALPREDICATELOCK *)
+                                                               hash_search_with_hash_value(LocalPredicateLockHash,
+                                                                                                                       targettag, targettaghash,
+                                                                                                                       HASH_REMOVE, NULL);
+                                                       Assert(rmlocallock == locallock);
+                                               }
                                         }
  
                                         DecrementParentLocks(targettag);
@@ -3848,7 +3714,7 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
                                          * the target, bail out before re-acquiring the locks.
                                          */
                                         if (rmtarget)
-                                               return hasnexttarget;
+                                               return;
  
                                         /*
                                          * The list has been altered.  Start over at the front.
@@ -3895,8 +3761,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
         }
         LWLockRelease(SerializableXactHashLock);
         LWLockRelease(partitionLock);
-
-       return hasnexttarget;
  }
  
  /*
@@ -3943,7 +3807,8 @@ CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple,
                                                                                  relation->rd_node.dbNode,
                                                                                  relation->rd_id,
                                                  ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)),
-                                          ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)));
+                                          ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)),
+                                          HeapTupleHeaderGetXmin(tuple->t_data));
                 CheckTargetForConflictsIn(&targettag);
         }
  
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h

index 99fe37ef83bc515eb9fd4b3ccaa4d8e88b9090a1..ad0bcd775b0f9bcab60a067d6de5a342ba4e24c4 100644 (file)
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -78,7 +78,6 @@ typedef enum LWLockId
         SerializableFinishedListLock,
         SerializablePredicateLockListLock,
         OldSerXidLock,
-       PredicateLockNextRowLinkLock,
         /* Individual lock IDs end here */
         FirstBufMappingLock,
         FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h

index 32e9a1bc218358e04868e765445e40e326cb877c..055612563675c066778eb2f08b7a7dc6ca442714 100644 (file)
--- a/src/include/storage/predicate_internals.h
+++ b/src/include/storage/predicate_internals.h
@@ -150,8 +150,6 @@ typedef struct PredXactListData
         SerCommitSeqNo HavePartialClearedThrough;       /* have cleared through this
                                                                                                  * seq no */
         SERIALIZABLEXACT *OldCommittedSxact;            /* shared copy of dummy sxact */
-       bool            NeedTargetLinkCleanup;  /* to save cleanup effort for rare
-                                                                                * case */
  
         PredXactListElement element;
  } PredXactListData;
@@ -231,9 +229,13 @@ typedef struct SERIALIZABLEXID
  
  /*
   * The PREDICATELOCKTARGETTAG struct identifies a database object which can
- * be the target of predicate locks.  It is designed to fit into 16 bytes
- * with no padding.  Note that this would need adjustment if we widen Oid or
- * BlockNumber to more than 32 bits.
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- it will go to a four byte boundary past the end of the tag.
+ * If you change this struct, make sure any slack space is initialized,
+ * so that any random bytes in the middle or at the end are not included
+ * in the hash.
   *
   * TODO SSI: If we always use the same fields for the same type of value, we
   * should rename these.  Holding off until it's clear there are no exceptions.
@@ -247,8 +249,8 @@ typedef struct PREDICATELOCKTARGETTAG
         uint32          locktag_field1; /* a 32-bit ID field */
         uint32          locktag_field2; /* a 32-bit ID field */
         uint32          locktag_field3; /* a 32-bit ID field */
-       uint16          locktag_field4; /* a 16-bit ID field */
-       uint16          locktag_field5; /* a 16-bit ID field */
+       uint32          locktag_field4; /* a 32-bit ID field */
+       uint32          locktag_field5; /* a 32-bit ID field */
  } PREDICATELOCKTARGETTAG;
  
  /*
@@ -260,12 +262,11 @@ typedef struct PREDICATELOCKTARGETTAG
   * already have one.  An entry is removed when the last lock is removed from
   * its list.
   *
- * Because a check for predicate locks on a tuple target should also find
- * locks on previous versions of the same row, if there are any created by
- * overlapping transactions, we keep a pointer to the target for the prior
- * version of the row. We also keep a pointer to the next version of the
- * row, so that when we no longer have any predicate locks and the back
- * pointer is clear, we can clean up the prior pointer for the next version.
+ * Because a particular target might become obsolete, due to update to a new
+ * version, before the reading transaction is obsolete, we need some way to
+ * prevent errors from reuse of a tuple ID.  Rather than attempting to clean
+ * up the targets as the related tuples are pruned or vacuumed, we check the
+ * xmin on access.  This should be far less costly.
   */
  typedef struct PREDICATELOCKTARGET PREDICATELOCKTARGET;
  
@@ -277,15 +278,6 @@ struct PREDICATELOCKTARGET
         /* data */
         SHM_QUEUE       predicateLocks; /* list of PREDICATELOCK objects assoc. with
                                                                  * predicate lock target */
-
-       /*
-        * The following two pointers are only used for tuple locks, and are only
-        * consulted for conflict detection and cleanup; not for granularity
-        * promotion.
-        */
-       PREDICATELOCKTARGET *priorVersionOfRow;         /* what other locks to check */
-       PREDICATELOCKTARGET *nextVersionOfRow;          /* who has pointer here for
-                                                                                                * more targets */
  };
  
  
@@ -387,30 +379,32 @@ typedef struct PredicateLockData
          (locktag).locktag_field2 = (reloid), \
          (locktag).locktag_field3 = InvalidBlockNumber, \
          (locktag).locktag_field4 = InvalidOffsetNumber, \
-        (locktag).locktag_field5 = 0)
+        (locktag).locktag_field5 = InvalidTransactionId)
  
  #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
         ((locktag).locktag_field1 = (dboid), \
          (locktag).locktag_field2 = (reloid), \
          (locktag).locktag_field3 = (blocknum), \
          (locktag).locktag_field4 = InvalidOffsetNumber, \
-        (locktag).locktag_field5 = 0)
+        (locktag).locktag_field5 = InvalidTransactionId)
  
-#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
         ((locktag).locktag_field1 = (dboid), \
          (locktag).locktag_field2 = (reloid), \
          (locktag).locktag_field3 = (blocknum), \
          (locktag).locktag_field4 = (offnum), \
-        (locktag).locktag_field5 = 0)
+        (locktag).locktag_field5 = (xmin))
  
  #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
-       ((locktag).locktag_field1)
+       ((Oid) (locktag).locktag_field1)
  #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
-       ((locktag).locktag_field2)
+       ((Oid) (locktag).locktag_field2)
  #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
-       ((locktag).locktag_field3)
+       ((BlockNumber) (locktag).locktag_field3)
  #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
-       ((locktag).locktag_field4)
+       ((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
+       ((TransactionId) (locktag).locktag_field5)
  #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)                                                        \
         (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
          (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 1 Mar 2011 17:05:16 +0000 (19:05 +0200)
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/predicate.c		patch \| blob \| blame \| history
src/include/storage/lwlock.h		patch \| blob \| blame \| history
src/include/storage/predicate_internals.h		patch \| blob \| blame \| history