Divide the lock manager's shared state into 'partitions', so as to
authorTom Lane <tgl@sss.pgh.pa.us>
Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
reduce contention for the former single LockMgrLock.  Per my recent
proposal.  I set it up for 16 partitions, but on a pgbench test this
gives only a marginal further improvement over 4 partitions --- we need
to test more scenarios to choose the number of partitions.

src/backend/access/transam/twophase.c
src/backend/storage/ipc/procarray.c
src/backend/storage/lmgr/README
src/backend/storage/lmgr/deadlock.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/proc.c
src/include/storage/lock.h
src/include/storage/lwlock.h
src/include/storage/proc.h

index ffdee8388b3a96e851be2ddb034bff473fbf549c..0898df623375e62a0b9e734e59d6a0cb7d1d8f70 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.17 2005/11/22 18:17:07 momjian Exp $
+ *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.18 2005/12/11 21:02:17 tgl Exp $
  *
  * NOTES
  *             Each global transaction is associated with a global transaction
@@ -284,7 +284,8 @@ MarkAsPreparing(TransactionId xid, const char *gid,
        gxact->proc.lwWaitLink = NULL;
        gxact->proc.waitLock = NULL;
        gxact->proc.waitProcLock = NULL;
-       SHMQueueInit(&(gxact->proc.procLocks));
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+               SHMQueueInit(&(gxact->proc.myProcLocks[i]));
        /* subxid data must be filled later by GXactLoadSubxactData */
        gxact->proc.subxids.overflowed = false;
        gxact->proc.subxids.nxids = 0;
index 7ac8084f6a33fc88e4b2372e0629e50cca104795..cafadeb90542d3167ac3c2669fb656ffdb54a19b 100644 (file)
@@ -14,8 +14,8 @@
  *
  * The process array now also includes PGPROC structures representing
  * prepared transactions.  The xid and subxids fields of these are valid,
- * as is the procLocks list.  They can be distinguished from regular backend
- * PGPROCs at need by checking for pid == 0.
+ * as are the myProcLocks lists.  They can be distinguished from regular
+ * backend PGPROCs at need by checking for pid == 0.
  *
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
@@ -23,7 +23,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.8 2005/11/22 18:17:20 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.9 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
index 25820f4b73d9e418713a1d85a78ec6273b34e261..fdda5bf82a44d2e3c30fbf8c550a121adabde367 100644 (file)
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.18 2005/12/09 01:22:04 tgl Exp $
+$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.19 2005/12/11 21:02:18 tgl Exp $
 
 
 LOCKING OVERVIEW
@@ -50,9 +50,12 @@ LOCK DATA STRUCTURES
 Lock methods describe the overall locking behavior.  Currently there are
 two lock methods: DEFAULT and USER.  (USER locks are non-blocking.)
 
-Lock modes describe the type of the lock (read/write or shared/exclusive). 
-See src/tools/backend/index.html and src/include/storage/lock.h for more
-details.
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets.  See src/tools/backend/index.html and
+src/include/storage/lock.h for more details.  (Lock modes are also called
+lock types in some places in the code and documentation.)
 
 There are two fundamental lock structures in shared memory: the
 per-lockable-object LOCK struct, and the per-lock-and-requestor PROCLOCK
@@ -67,7 +70,7 @@ be made per lockable object/lock mode/backend.  Internally to a backend,
 however, the same lock may be requested and perhaps released multiple times
 in a transaction, and it can also be held both transactionally and session-
 wide.  The internal request counts are held in LOCALLOCK so that the shared
-LockMgrLock need not be obtained to alter them.
+data structures need not be accessed to alter them.
 
 ---------------------------------------------------------------------------
 
@@ -103,10 +106,10 @@ procLocks -
     be waiting for more!).
 
 waitProcs -
-    This is a shared memory queue of all process structures corresponding to
-    a backend that is waiting (sleeping) until another backend releases this
+    This is a shared memory queue of all PGPROC structures corresponding to
+    backends that are waiting (sleeping) until another backend releases this
     lock.  The process structure holds the information needed to determine
-    if it should be woken up when this lock is released.
+    if it should be woken up when the lock is released.
 
 nRequested -
     Keeps a count of how many times this lock has been attempted to be
@@ -131,12 +134,12 @@ nGranted -
 granted -
     Keeps count of how many locks of each type are currently held.  Once again
     only elements 1 through MAX_LOCKMODES-1 are used (0 is not).  Also, like
-    requested, summing the values of granted should total to the value
+    requested[], summing the values of granted[] should total to the value
     of nGranted.
 
 We should always have 0 <= nGranted <= nRequested, and
-0 <= granted[i] <= requested[i] for each i.  If the request counts go to
-zero, the lock object is no longer needed and can be freed.
+0 <= granted[i] <= requested[i] for each i.  When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
 
 ---------------------------------------------------------------------------
 
@@ -154,15 +157,16 @@ tag -
         SHMEM offset of PGPROC of backend process that owns this PROCLOCK.
 
 holdMask -
-    A bitmask for the lock types successfully acquired by this PROCLOCK.
+    A bitmask for the lock modes successfully acquired by this PROCLOCK.
     This should be a subset of the LOCK object's grantMask, and also a
-    subset of the PGPROC object's heldLocks mask.
+    subset of the PGPROC object's heldLocks mask (if the PGPROC is
+    currently waiting for another lock mode on this lock).
 
 releaseMask -
-    A bitmask for the lock types due to be released during LockReleaseAll.
+    A bitmask for the lock modes due to be released during LockReleaseAll.
     This must be a subset of the holdMask.  Note that it is modified without
-    taking the LockMgrLock, and therefore it is unsafe for any backend except
-    the one owning the PROCLOCK to examine/change it.
+    taking the partition LWLock, and therefore it is unsafe for any
+    backend except the one owning the PROCLOCK to examine/change it.
 
 lockLink -
     List link for shared memory queue of all the PROCLOCK objects for the
@@ -174,7 +178,60 @@ procLink -
 
 ---------------------------------------------------------------------------
 
-The deadlock detection algorithm:
+
+LOCK MANAGER INTERNAL LOCKING
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock.  Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value (see LockTagToPartition()).  The partition's LWLock is
+considered to protect all the LOCK objects of that partition as well as
+their subsidiary PROCLOCKs.  The shared-memory hash tables for LOCKs and
+PROCLOCKs are divided into separate hash tables for each partition, and
+operations on each hash table are likewise protected by the partition
+lock.
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock.  (This is not strictly necessary at the moment, because at this
+writing a PGPROC's PROCLOCK list is only accessed by the owning backend
+anyway.  But it seems forward-looking to maintain a convention for how
+other backends could access it.  In any case LockReleaseAll needs to be
+able to quickly determine which partition each LOCK belongs to, and
+for the currently contemplated number of partitions, this way takes less
+shared memory than explicitly storing a partition number in LOCK structs
+would require.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock.  Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order.  (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.)  It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned.  We do store
+the partition number in LOCALLOCK table entries, but this is a straight
+speed-for-space tradeoff: we could instead recalculate the partition
+number from the LOCKTAG when needed.
+
+
+THE DEADLOCK DETECTION ALGORITHM
 
 Since we allow user transactions to request locks in any order, deadlock
 is possible.  We use a deadlock detection/breaking algorithm that is
index adbd373bb7f00642d2ad9ac641e2a0210b0b3682..e72ab00b5b03a2eb36efb8ecc70662c51d4c2125 100644 (file)
@@ -12,7 +12,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.37 2005/12/09 01:22:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.38 2005/12/11 21:02:18 tgl Exp $
  *
  *     Interface:
  *
@@ -53,9 +53,9 @@ typedef struct
  * Information saved about each edge in a detected deadlock cycle.     This
  * is used to print a diagnostic message upon failure.
  *
- * Note: because we want to examine this info after releasing the LockMgrLock,
- * we can't just store LOCK and PGPROC pointers; we must extract out all the
- * info we want to be able to print.
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
  */
 typedef struct
 {
@@ -188,19 +188,11 @@ InitDeadLockChecking(void)
  * deadlock.  If resolution is impossible, return TRUE --- the caller
  * is then expected to abort the given proc's transaction.
  *
- * We can't block on user locks, so no sense testing for deadlock
- * because there is no blocking, and no timer for the block.  So,
- * only look at regular locks.
- *
- * We must have already locked the master lock before being called.
- * NOTE: although the lockmethod structure appears to allow each lock
- * table to have a different masterLock, all locks that can block had
- * better use the same LWLock, else this code will not be adequately
- * interlocked!
+ * Caller must already have locked all partitions of the lock tables.
  *
  * On failure, deadlock details are recorded in deadlockDetails[] for
  * subsequent printing by DeadLockReport().  That activity is separate
- * because we don't want to do it while holding the master lock.
+ * because we don't want to do it while holding all those LWLocks.
  */
 bool
 DeadLockCheck(PGPROC *proc)
index 344d677cd2f29104e2e20f27370fe54b577202e8..7f42b477cc607f6f960e61c8874dd15a5b9678d6 100644 (file)
@@ -1,14 +1,14 @@
 /*-------------------------------------------------------------------------
  *
  * lock.c
- *       POSTGRES low-level lock mechanism
+ *       POSTGRES primary lock mechanism
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.161 2005/12/09 01:22:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.162 2005/12/11 21:02:18 tgl Exp $
  *
  * NOTES
  *       A lock table is a shared memory hash table.  When
@@ -163,10 +163,13 @@ typedef struct TwoPhaseLockRecord
 
 
 /*
- * Links to hash tables containing lock state
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
  */
-static HTAB *LockMethodLockHash;
-static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodLockHash[NUM_LOCK_PARTITIONS];
+static HTAB *LockMethodProcLockHash[NUM_LOCK_PARTITIONS];
 static HTAB *LockMethodLocalHash;
 
 
@@ -255,16 +258,25 @@ PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
 
 static void RemoveLocalLock(LOCALLOCK *locallock);
 static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
-static void WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
-                  ResourceOwner owner);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
 static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
                        PROCLOCK *proclock, LockMethod lockMethodTable);
-static void CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock,
-                       PROCLOCK *proclock, bool wakeupNeeded);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+                       LockMethod lockMethodTable, int partition,
+                       bool wakeupNeeded);
 
 
 /*
- * InitLocks -- Initialize the lock module's shared memory.
+ * InitLocks -- Initialize the lock manager's data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments.  In the normal postmaster case, the shared hash tables
+ * are created here, as well as a locallock hash table that will remain
+ * unused and empty in the postmaster itself.  Backends inherit the pointers
+ * to the shared tables via fork(), and also inherit an image of the locallock
+ * hash table, which they proceed to use.  In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables and to create its locallock hash table.
  */
 void
 InitLocks(void)
@@ -274,13 +286,18 @@ InitLocks(void)
        int                     hash_flags;
        long            init_table_size,
                                max_table_size;
+       int                     i;
 
-       /* Compute init/max size to request for lock hashtables */
+       /*
+        * Compute init/max size to request for lock hashtables.  Note these
+        * calculations must agree with LockShmemSize!
+        */
        max_table_size = NLOCKENTS();
+       max_table_size = (max_table_size - 1) / NUM_LOCK_PARTITIONS + 1;
        init_table_size = max_table_size / 2;
 
        /*
-        * allocate a hash table for LOCK structs.      This is used to store
+        * Allocate hash tables for LOCK structs.  These are used to store
         * per-locked-object information.
         */
        MemSet(&info, 0, sizeof(info));
@@ -289,37 +306,45 @@ InitLocks(void)
        info.hash = tag_hash;
        hash_flags = (HASH_ELEM | HASH_FUNCTION);
 
-       sprintf(shmemName, "LOCK hash");
-       LockMethodLockHash = ShmemInitHash(shmemName,
-                                                                          init_table_size,
-                                                                          max_table_size,
-                                                                          &info,
-                                                                          hash_flags);
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       {
+               sprintf(shmemName, "LOCK hash %d", i);
+               LockMethodLockHash[i] = ShmemInitHash(shmemName,
+                                                                                         init_table_size,
+                                                                                         max_table_size,
+                                                                                         &info,
+                                                                                         hash_flags);
+               if (!LockMethodLockHash[i])
+                       elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+       }
 
-       if (!LockMethodLockHash)
-               elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+       /* Assume an average of 2 holders per lock */
+       max_table_size *= 2;
+       init_table_size *= 2;
 
        /*
-        * allocate a hash table for PROCLOCK structs.  This is used to store
-        * per-lock-holder information.
+        * Allocate hash tables for PROCLOCK structs.  These are used to store
+        * per-lock-per-holder information.
         */
        info.keysize = sizeof(PROCLOCKTAG);
        info.entrysize = sizeof(PROCLOCK);
        info.hash = tag_hash;
        hash_flags = (HASH_ELEM | HASH_FUNCTION);
 
-       sprintf(shmemName, "PROCLOCK hash");
-       LockMethodProcLockHash = ShmemInitHash(shmemName,
-                                                                                  init_table_size,
-                                                                                  max_table_size,
-                                                                                  &info,
-                                                                                  hash_flags);
-
-       if (!LockMethodProcLockHash)
-               elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       {
+               sprintf(shmemName, "PROCLOCK hash %d", i);
+               LockMethodProcLockHash[i] = ShmemInitHash(shmemName,
+                                                                                                 init_table_size,
+                                                                                                 max_table_size,
+                                                                                                 &info,
+                                                                                                 hash_flags);
+               if (!LockMethodProcLockHash[i])
+                       elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+       }
 
        /*
-        * allocate a non-shared hash table for LOCALLOCK structs.      This is used
+        * Allocate one non-shared hash table for LOCALLOCK structs.  This is used
         * to store lock counts and resource owner information.
         *
         * The non-shared table could already exist in this process (this occurs
@@ -355,6 +380,39 @@ GetLocksMethodTable(const LOCK *lock)
 }
 
 
+/*
+ * Given a LOCKTAG, determine which partition the lock belongs in.
+ * The result is a partition number in the range 0..NUM_LOCK_PARTITIONS-1.
+ *
+ * Basically what we want to do here is hash the locktag.  However, it
+ * seems unwise to use hash_any() because that is the same function that
+ * will be used to distribute the locks within each partition's hash table;
+ * if we use it, we run a big risk of having uneven distribution of hash
+ * codes there.  Instead, we XOR together all the bytes of the locktag and
+ * then fold that byte down to the number of bits a partition number needs.
+ */
+int
+LockTagToPartition(const LOCKTAG *locktag)
+{
+       const uint8 *ptr = (const uint8 *) locktag;
+       int                     result = 0;
+       int                     i;
+
+       for (i = 0; i < sizeof(LOCKTAG); i++)
+               result ^= *ptr++;       /* accumulate XOR of every byte of the tag */
+#if NUM_LOCK_PARTITIONS == 16
+       result ^= result >> 4;  /* fold high nibble into low nibble */
+       result &= 0x0F;         /* keep 4 bits: 0..15 */
+#elif NUM_LOCK_PARTITIONS == 4
+       result ^= result >> 4;  /* fold high nibble into low nibble */
+       result ^= result >> 2;  /* fold again so the low 2 bits see all 8 */
+       result &= 0x03;         /* keep 2 bits: 0..3 */
+#else
+#error unsupported NUM_LOCK_PARTITIONS
+#endif
+       return result;
+}
+
+
 /*
  * LockAcquire -- Check for lock conflicts, sleep if conflict found,
  *             set lock if/when no conflicts.
@@ -397,7 +455,8 @@ LockAcquire(const LOCKTAG *locktag,
        PROCLOCKTAG proclocktag;
        bool            found;
        ResourceOwner owner;
-       LWLockId        masterLock;
+       int                     partition;
+       LWLockId        partitionLock;
        int                     status;
 
        if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
@@ -438,6 +497,7 @@ LockAcquire(const LOCKTAG *locktag,
                locallock->lock = NULL;
                locallock->proclock = NULL;
                locallock->isTempObject = isTempObject;
+               locallock->partition = LockTagToPartition(&(localtag.lock));
                locallock->nLocks = 0;
                locallock->numLockOwners = 0;
                locallock->maxLockOwners = 8;
@@ -474,9 +534,10 @@ LockAcquire(const LOCKTAG *locktag,
        /*
         * Otherwise we've got to mess with the shared lock table.
         */
-       masterLock = LockMgrLock;
+       partition = locallock->partition;
+       partitionLock = FirstLockMgrLock + partition;
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        /*
         * Find or create a lock with this tag.
@@ -486,12 +547,12 @@ LockAcquire(const LOCKTAG *locktag,
         * pointer is valid, since a lock object with no locks can go away
         * anytime.
         */
-       lock = (LOCK *) hash_search(LockMethodLockHash,
+       lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                                                (void *) locktag,
                                                                HASH_ENTER_NULL, &found);
        if (!lock)
        {
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of shared memory"),
@@ -532,7 +593,7 @@ LockAcquire(const LOCKTAG *locktag,
        /*
         * Find or create a proclock entry with this tag
         */
-       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                                                                (void *) &proclocktag,
                                                                                HASH_ENTER_NULL, &found);
        if (!proclock)
@@ -547,12 +608,12 @@ LockAcquire(const LOCKTAG *locktag,
                         * anyone to release the lock object later.
                         */
                        Assert(SHMQueueEmpty(&(lock->procLocks)));
-                       if (!hash_search(LockMethodLockHash,
+                       if (!hash_search(LockMethodLockHash[partition],
                                                         (void *) &(lock->tag),
                                                         HASH_REMOVE, NULL))
                                elog(PANIC, "lock table corrupted");
                }
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of shared memory"),
@@ -569,7 +630,8 @@ LockAcquire(const LOCKTAG *locktag,
                proclock->releaseMask = 0;
                /* Add proclock to appropriate lists */
                SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
-               SHMQueueInsertBefore(&MyProc->procLocks, &proclock->procLink);
+               SHMQueueInsertBefore(&(MyProc->myProcLocks[partition]),
+                                                        &proclock->procLink);
                PROCLOCK_PRINT("LockAcquire: new", proclock);
        }
        else
@@ -666,7 +728,7 @@ LockAcquire(const LOCKTAG *locktag,
                        {
                                SHMQueueDelete(&proclock->lockLink);
                                SHMQueueDelete(&proclock->procLink);
-                               if (!hash_search(LockMethodProcLockHash,
+                               if (!hash_search(LockMethodProcLockHash[partition],
                                                                 (void *) &(proclock->tag),
                                                                 HASH_REMOVE, NULL))
                                        elog(PANIC, "proclock table corrupted");
@@ -678,7 +740,7 @@ LockAcquire(const LOCKTAG *locktag,
                        LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
                        Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
                        Assert(lock->nGranted <= lock->nRequested);
-                       LWLockRelease(masterLock);
+                       LWLockRelease(partitionLock);
                        if (locallock->nLocks == 0)
                                RemoveLocalLock(locallock);
                        return LOCKACQUIRE_NOT_AVAIL;
@@ -692,7 +754,7 @@ LockAcquire(const LOCKTAG *locktag,
                /*
                 * Sleep till someone wakes me up.
                 */
-               WaitOnLock(lockmethodid, locallock, owner);
+               WaitOnLock(locallock, owner);
 
                /*
                 * NOTE: do not do any material change of state between here and
@@ -709,14 +771,14 @@ LockAcquire(const LOCKTAG *locktag,
                        PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
                        LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
                        /* Should we retry ? */
-                       LWLockRelease(masterLock);
+                       LWLockRelease(partitionLock);
                        elog(ERROR, "LockAcquire failed");
                }
                PROCLOCK_PRINT("LockAcquire: granted", proclock);
                LOCK_PRINT("LockAcquire: granted", lock, lockmode);
        }
 
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
 
        return LOCKACQUIRE_OK;
 }
@@ -894,11 +956,12 @@ UnGrantLock(LOCK *lock, LOCKMODE lockmode,
  * should be called after UnGrantLock, and wakeupNeeded is the result from
  * UnGrantLock.)
  *
- * The locktable's masterLock must be held at entry, and will be
+ * The lock table's partition lock must be held at entry, and will be
  * held at exit.
  */
 static void
-CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+                       LockMethod lockMethodTable, int partition,
                        bool wakeupNeeded)
 {
        /*
@@ -910,7 +973,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
                PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
                SHMQueueDelete(&proclock->lockLink);
                SHMQueueDelete(&proclock->procLink);
-               if (!hash_search(LockMethodProcLockHash,
+               if (!hash_search(LockMethodProcLockHash[partition],
                                                 (void *) &(proclock->tag),
                                                 HASH_REMOVE, NULL))
                        elog(PANIC, "proclock table corrupted");
@@ -924,7 +987,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
                 */
                LOCK_PRINT("CleanUpLock: deleting", lock, 0);
                Assert(SHMQueueEmpty(&(lock->procLocks)));
-               if (!hash_search(LockMethodLockHash,
+               if (!hash_search(LockMethodLockHash[partition],
                                                 (void *) &(lock->tag),
                                                 HASH_REMOVE, NULL))
                        elog(PANIC, "lock table corrupted");
@@ -932,7 +995,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
        else if (wakeupNeeded)
        {
                /* There are waiters on this lock, so wake them up. */
-               ProcLockWakeup(LockMethods[lockmethodid], lock);
+               ProcLockWakeup(lockMethodTable, lock);
        }
 }
 
@@ -988,12 +1051,12 @@ GrantAwaitedLock(void)
  * Caller must have set MyProc->heldLocks to reflect locks already held
  * on the lockable object by this process.
  *
- * The locktable's masterLock must be held at entry.
+ * The appropriate partition lock must be held at entry.
  */
 static void
-WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
-                  ResourceOwner owner)
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
 {
+       LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
        LockMethod      lockMethodTable = LockMethods[lockmethodid];
        const char *old_status;
        char       *new_status;
@@ -1025,10 +1088,7 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
         * will also happen in the cancel/die case.
         */
 
-       if (ProcSleep(lockMethodTable,
-                                 locallock->tag.mode,
-                                 locallock->lock,
-                                 locallock->proclock) != STATUS_OK)
+       if (ProcSleep(locallock, lockMethodTable) != STATUS_OK)
        {
                /*
                 * We failed as a result of a deadlock, see CheckDeadLock(). Quit now.
@@ -1036,10 +1096,10 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
                awaitedLock = NULL;
                LOCK_PRINT("WaitOnLock: aborting on lock",
                                   locallock->lock, locallock->tag.mode);
-               LWLockRelease(LockMgrLock);
+               LWLockRelease(FirstLockMgrLock + locallock->partition);
 
                /*
-                * Now that we aren't holding the LockMgrLock, we can give an error
+                * Now that we aren't holding the partition lock, we can give an error
                 * report including details about the detected deadlock.
                 */
                DeadLockReport();
@@ -1059,12 +1119,12 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
  * Remove a proc from the wait-queue it is on
  * (caller must know it is on one).
  *
- * Locktable lock must be held by caller.
+ * Appropriate partition lock must be held by caller.
  *
  * NB: this does not clean up any locallock object that may exist for the lock.
  */
 void
-RemoveFromWaitQueue(PGPROC *proc)
+RemoveFromWaitQueue(PGPROC *proc, int partition)
 {
        LOCK       *waitLock = proc->waitLock;
        PROCLOCK   *proclock = proc->waitProcLock;
@@ -1102,7 +1162,9 @@ RemoveFromWaitQueue(PGPROC *proc)
         * LockRelease expects there to be no remaining proclocks.) Then see if
         * any other waiters for the lock can be woken up now.
         */
-       CleanUpLock(lockmethodid, waitLock, proclock, true);
+       CleanUpLock(waitLock, proclock,
+                               LockMethods[lockmethodid], partition,
+                               true);
 }
 
 /*
@@ -1125,7 +1187,8 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
        LOCALLOCK  *locallock;
        LOCK       *lock;
        PROCLOCK   *proclock;
-       LWLockId        masterLock;
+       int                     partition;
+       LWLockId        partitionLock;
        bool            wakeupNeeded;
 
        if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
@@ -1212,9 +1275,10 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
        /*
         * Otherwise we've got to mess with the shared lock table.
         */
-       masterLock = LockMgrLock;
+       partition = locallock->partition;
+       partitionLock = FirstLockMgrLock + partition;
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        /*
         * We don't need to re-find the lock or proclock, since we kept their
@@ -1233,7 +1297,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
        if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
        {
                PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                elog(WARNING, "you don't own a lock of type %s",
                         lockMethodTable->lockModeNames[lockmode]);
                RemoveLocalLock(locallock);
@@ -1245,9 +1309,11 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
         */
        wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
 
-       CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+       CleanUpLock(lock, proclock,
+                               lockMethodTable, partition,
+                               wakeupNeeded);
 
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
 
        RemoveLocalLock(locallock);
        return TRUE;
@@ -1265,14 +1331,13 @@ void
 LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
 {
        HASH_SEQ_STATUS status;
-       SHM_QUEUE  *procLocks = &(MyProc->procLocks);
-       LWLockId        masterLock;
        LockMethod      lockMethodTable;
        int                     i,
                                numLockModes;
        LOCALLOCK  *locallock;
-       PROCLOCK   *proclock;
        LOCK       *lock;
+       PROCLOCK   *proclock;
+       int                     partition;
 
        if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
                elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -1284,7 +1349,6 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
 #endif
 
        numLockModes = lockMethodTable->numLockModes;
-       masterLock = LockMgrLock;
 
        /*
         * First we run through the locallock table and get rid of unwanted
@@ -1351,74 +1415,89 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
                RemoveLocalLock(locallock);
        }
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       /*
+        * Now, scan each lock partition separately.
+        */
+       for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+       {
+               LWLockId        partitionLock = FirstLockMgrLock + partition;
+               SHM_QUEUE  *procLocks = &(MyProc->myProcLocks[partition]);
 
-       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                                                                offsetof(PROCLOCK, procLink));
+               proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                                                                        offsetof(PROCLOCK, procLink));
 
-       while (proclock)
-       {
-               bool            wakeupNeeded = false;
-               PROCLOCK   *nextplock;
+               if (!proclock)
+                       continue;                       /* needn't examine this partition */
 
-               /* Get link first, since we may unlink/delete this proclock */
-               nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                                                                         offsetof(PROCLOCK, procLink));
+               LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
-               Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
+               while (proclock)
+               {
+                       bool            wakeupNeeded = false;
+                       PROCLOCK   *nextplock;
 
-               lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+                       /* Get link first, since we may unlink/delete this proclock */
+                       nextplock = (PROCLOCK *)
+                               SHMQueueNext(procLocks, &proclock->procLink,
+                                                        offsetof(PROCLOCK, procLink));
 
-               /* Ignore items that are not of the lockmethod to be removed */
-               if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
-                       goto next_item;
+                       Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
 
-               /*
-                * In allLocks mode, force release of all locks even if locallock
-                * table had problems
-                */
-               if (allLocks)
-                       proclock->releaseMask = proclock->holdMask;
-               else
-                       Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+                       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
 
-               /*
-                * Ignore items that have nothing to be released, unless they have
-                * holdMask == 0 and are therefore recyclable
-                */
-               if (proclock->releaseMask == 0 && proclock->holdMask != 0)
-                       goto next_item;
+                       /* Ignore items that are not of the lockmethod to be removed */
+                       if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+                               goto next_item;
 
-               PROCLOCK_PRINT("LockReleaseAll", proclock);
-               LOCK_PRINT("LockReleaseAll", lock, 0);
-               Assert(lock->nRequested >= 0);
-               Assert(lock->nGranted >= 0);
-               Assert(lock->nGranted <= lock->nRequested);
-               Assert((proclock->holdMask & ~lock->grantMask) == 0);
+                       /*
+                        * In allLocks mode, force release of all locks even if locallock
+                        * table had problems
+                        */
+                       if (allLocks)
+                               proclock->releaseMask = proclock->holdMask;
+                       else
+                               Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
 
-               /*
-                * Release the previously-marked lock modes
-                */
-               for (i = 1; i <= numLockModes; i++)
-               {
-                       if (proclock->releaseMask & LOCKBIT_ON(i))
-                               wakeupNeeded |= UnGrantLock(lock, i, proclock,
-                                                                                       lockMethodTable);
-               }
-               Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
-               Assert(lock->nGranted <= lock->nRequested);
-               LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+                       /*
+                        * Ignore items that have nothing to be released, unless they have
+                        * holdMask == 0 and are therefore recyclable
+                        */
+                       if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+                               goto next_item;
 
-               proclock->releaseMask = 0;
+                       PROCLOCK_PRINT("LockReleaseAll", proclock);
+                       LOCK_PRINT("LockReleaseAll", lock, 0);
+                       Assert(lock->nRequested >= 0);
+                       Assert(lock->nGranted >= 0);
+                       Assert(lock->nGranted <= lock->nRequested);
+                       Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+                       /*
+                        * Release the previously-marked lock modes
+                        */
+                       for (i = 1; i <= numLockModes; i++)
+                       {
+                               if (proclock->releaseMask & LOCKBIT_ON(i))
+                                       wakeupNeeded |= UnGrantLock(lock, i, proclock,
+                                                                                               lockMethodTable);
+                       }
+                       Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+                       Assert(lock->nGranted <= lock->nRequested);
+                       LOCK_PRINT("LockReleaseAll: updated", lock, 0);
 
-               /* CleanUpLock will wake up waiters if needed. */
-               CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+                       proclock->releaseMask = 0;
 
-next_item:
-               proclock = nextplock;
-       }
+                       /* CleanUpLock will wake up waiters if needed. */
+                       CleanUpLock(lock, proclock,
+                                               lockMethodTable, partition,
+                                               wakeupNeeded);
 
-       LWLockRelease(masterLock);
+               next_item:
+                       proclock = nextplock;
+               } /* loop over PROCLOCKs within this partition */
+
+               LWLockRelease(partitionLock);
+       } /* loop over partitions */
 
 #ifdef LOCK_DEBUG
        if (*(lockMethodTable->trace_flag))
@@ -1627,19 +1706,16 @@ PostPrepare_Locks(TransactionId xid)
 {
        PGPROC     *newproc = TwoPhaseGetDummyProc(xid);
        HASH_SEQ_STATUS status;
-       SHM_QUEUE  *procLocks = &(MyProc->procLocks);
-       LWLockId        masterLock;
        LOCALLOCK  *locallock;
+       LOCK       *lock;
        PROCLOCK   *proclock;
        PROCLOCKTAG proclocktag;
        bool            found;
-       LOCK       *lock;
+       int                     partition;
 
        /* This is a critical section: any error means big trouble */
        START_CRIT_SECTION();
 
-       masterLock = LockMgrLock;
-
        /*
         * First we run through the locallock table and get rid of unwanted
         * entries, then we scan the process's proclocks and transfer them to the
@@ -1678,105 +1754,121 @@ PostPrepare_Locks(TransactionId xid)
                RemoveLocalLock(locallock);
        }
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       /*
+        * Now, scan each lock partition separately.
+        */
+       for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+       {
+               LWLockId        partitionLock = FirstLockMgrLock + partition;
+               SHM_QUEUE  *procLocks = &(MyProc->myProcLocks[partition]);
 
-       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                                                                offsetof(PROCLOCK, procLink));
+               proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                                                                        offsetof(PROCLOCK, procLink));
 
-       while (proclock)
-       {
-               PROCLOCK   *nextplock;
-               LOCKMASK        holdMask;
-               PROCLOCK   *newproclock;
+               if (!proclock)
+                       continue;                       /* needn't examine this partition */
 
-               /* Get link first, since we may unlink/delete this proclock */
-               nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                                                                         offsetof(PROCLOCK, procLink));
+               LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
-               Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
+               while (proclock)
+               {
+                       PROCLOCK   *nextplock;
+                       LOCKMASK        holdMask;
+                       PROCLOCK   *newproclock;
 
-               lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+                       /* Get link first, since we may unlink/delete this proclock */
+                       nextplock = (PROCLOCK *)
+                               SHMQueueNext(procLocks, &proclock->procLink,
+                                                        offsetof(PROCLOCK, procLink));
 
-               /* Ignore nontransactional locks */
-               if (!LockMethods[LOCK_LOCKMETHOD(*lock)]->transactional)
-                       goto next_item;
+                       Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
 
-               PROCLOCK_PRINT("PostPrepare_Locks", proclock);
-               LOCK_PRINT("PostPrepare_Locks", lock, 0);
-               Assert(lock->nRequested >= 0);
-               Assert(lock->nGranted >= 0);
-               Assert(lock->nGranted <= lock->nRequested);
-               Assert((proclock->holdMask & ~lock->grantMask) == 0);
+                       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
 
-               /*
-                * Since there were no session locks, we should be releasing all locks
-                */
-               if (proclock->releaseMask != proclock->holdMask)
-                       elog(PANIC, "we seem to have dropped a bit somewhere");
+                       /* Ignore nontransactional locks */
+                       if (!LockMethods[LOCK_LOCKMETHOD(*lock)]->transactional)
+                               goto next_item;
 
-               holdMask = proclock->holdMask;
+                       PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+                       LOCK_PRINT("PostPrepare_Locks", lock, 0);
+                       Assert(lock->nRequested >= 0);
+                       Assert(lock->nGranted >= 0);
+                       Assert(lock->nGranted <= lock->nRequested);
+                       Assert((proclock->holdMask & ~lock->grantMask) == 0);
 
-               /*
-                * We cannot simply modify proclock->tag.proc to reassign ownership of
-                * the lock, because that's part of the hash key and the proclock
-                * would then be in the wrong hash chain.  So, unlink and delete the
-                * old proclock; create a new one with the right contents; and link it
-                * into place.  We do it in this order to be certain we won't run out
-                * of shared memory (the way dynahash.c works, the deleted object is
-                * certain to be available for reallocation).
-                */
-               SHMQueueDelete(&proclock->lockLink);
-               SHMQueueDelete(&proclock->procLink);
-               if (!hash_search(LockMethodProcLockHash,
-                                                (void *) &(proclock->tag),
-                                                HASH_REMOVE, NULL))
-                       elog(PANIC, "proclock table corrupted");
+                       /*
+                        * Since there were no session locks, we should be releasing all
+                        * locks
+                        */
+                       if (proclock->releaseMask != proclock->holdMask)
+                               elog(PANIC, "we seem to have dropped a bit somewhere");
 
-               /*
-                * Create the hash key for the new proclock table.
-                */
-               MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));
-               proclocktag.lock = MAKE_OFFSET(lock);
-               proclocktag.proc = MAKE_OFFSET(newproc);
-
-               newproclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
-                                                                                          (void *) &proclocktag,
-                                                                                          HASH_ENTER_NULL, &found);
-               if (!newproclock)
-                       ereport(PANIC,          /* should not happen */
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of shared memory"),
-                                        errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
+                       holdMask = proclock->holdMask;
 
-               /*
-                * If new, initialize the new entry
-                */
-               if (!found)
-               {
-                       newproclock->holdMask = 0;
-                       newproclock->releaseMask = 0;
-                       /* Add new proclock to appropriate lists */
-                       SHMQueueInsertBefore(&lock->procLocks, &newproclock->lockLink);
-                       SHMQueueInsertBefore(&newproc->procLocks, &newproclock->procLink);
-                       PROCLOCK_PRINT("PostPrepare_Locks: new", newproclock);
-               }
-               else
-               {
-                       PROCLOCK_PRINT("PostPrepare_Locks: found", newproclock);
-                       Assert((newproclock->holdMask & ~lock->grantMask) == 0);
-               }
+                       /*
+                        * We cannot simply modify proclock->tag.proc to reassign
+                        * ownership of the lock, because that's part of the hash key and
+                        * the proclock would then be in the wrong hash chain.  So, unlink
+                        * and delete the old proclock; create a new one with the right
+                        * contents; and link it into place.  We do it in this order to be
+                        * certain we won't run out of shared memory (the way dynahash.c
+                        * works, the deleted object is certain to be available for
+                        * reallocation).
+                        */
+                       SHMQueueDelete(&proclock->lockLink);
+                       SHMQueueDelete(&proclock->procLink);
+                       if (!hash_search(LockMethodProcLockHash[partition],
+                                                        (void *) &(proclock->tag),
+                                                        HASH_REMOVE, NULL))
+                               elog(PANIC, "proclock table corrupted");
 
-               /*
-                * Pass over the identified lock ownership.
-                */
-               Assert((newproclock->holdMask & holdMask) == 0);
-               newproclock->holdMask |= holdMask;
+                       /*
+                        * Create the hash key for the new proclock entry.
+                        */
+                       MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));
+                       proclocktag.lock = MAKE_OFFSET(lock);
+                       proclocktag.proc = MAKE_OFFSET(newproc);
+
+                       newproclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
+                                                                                                  (void *) &proclocktag,
+                                                                                                  HASH_ENTER_NULL, &found);
+                       if (!newproclock)
+                               ereport(PANIC,          /* should not happen */
+                                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                                errmsg("out of shared memory"),
+                                                errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
 
-next_item:
-               proclock = nextplock;
-       }
+                       /*
+                        * If new, initialize the new entry
+                        */
+                       if (!found)
+                       {
+                               newproclock->holdMask = 0;
+                               newproclock->releaseMask = 0;
+                               /* Add new proclock to appropriate lists */
+                               SHMQueueInsertBefore(&lock->procLocks, &newproclock->lockLink);
+                               SHMQueueInsertBefore(&(newproc->myProcLocks[partition]),
+                                                                        &newproclock->procLink);
+                               PROCLOCK_PRINT("PostPrepare_Locks: new", newproclock);
+                       }
+                       else
+                       {
+                               PROCLOCK_PRINT("PostPrepare_Locks: found", newproclock);
+                               Assert((newproclock->holdMask & ~lock->grantMask) == 0);
+                       }
+
+                       /*
+                        * Pass over the identified lock ownership.
+                        */
+                       Assert((newproclock->holdMask & holdMask) == 0);
+                       newproclock->holdMask |= holdMask;
+
+               next_item:
+                       proclock = nextplock;
+               } /* loop over PROCLOCKs within this partition */
 
-       LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
+       } /* loop over partitions */
 
        END_CRIT_SECTION();
 }
@@ -1789,20 +1881,23 @@ Size
 LockShmemSize(void)
 {
        Size            size = 0;
-       long            max_table_size = NLOCKENTS();
+       Size            tabsize;
+       long            max_table_size;
 
-       /* lockHash table */
-       size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK)));
+       /* lock hash tables */
+       max_table_size = NLOCKENTS();
+       max_table_size = (max_table_size - 1) / NUM_LOCK_PARTITIONS + 1;
+       tabsize = hash_estimate_size(max_table_size, sizeof(LOCK));
+       size = add_size(size, mul_size(tabsize, NUM_LOCK_PARTITIONS));
 
-       /* proclockHash table */
-       size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
+       /* proclock hash tables */
+       max_table_size *= 2;
+       tabsize = hash_estimate_size(max_table_size, sizeof(PROCLOCK));
+       size = add_size(size, mul_size(tabsize, NUM_LOCK_PARTITIONS));
 
        /*
-        * Note we count only one pair of hash tables, since the userlocks table
-        * actually overlays the main one.
-        *
-        * Since the lockHash entry count above is only an estimate, add 10%
-        * safety margin.
+        * Since there is likely to be some space wastage due to uneven use
+        * of the partitions, add 10% safety margin.
         */
        size = add_size(size, size / 10);
 
@@ -1818,9 +1913,9 @@ LockShmemSize(void)
  * copies of the same PGPROC and/or LOCK objects are likely to appear.
  * It is the caller's responsibility to match up duplicates if wanted.
  *
- * The design goal is to hold the LockMgrLock for as short a time as possible;
+ * The design goal is to hold the LWLocks for as short a time as possible;
  * thus, this function simply makes a copy of the necessary data and releases
- * the lock, allowing the caller to contemplate and format the data for as
+ * the locks, allowing the caller to contemplate and format the data for as
  * long as it pleases.
  */
 LockData *
@@ -1830,40 +1925,67 @@ GetLockStatusData(void)
        HTAB       *proclockTable;
        PROCLOCK   *proclock;
        HASH_SEQ_STATUS seqstat;
+       int                     els;
+       int                     el;
        int                     i;
 
        data = (LockData *) palloc(sizeof(LockData));
 
-       LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
-
-       proclockTable = LockMethodProcLockHash;
-
-       data->nelements = i = proclockTable->hctl->nentries;
+       /*
+        * Acquire locks on all partitions of the shared lock data.  We can't
+        * operate one partition at a time if we want to deliver a self-consistent
+        * view of the state.
+        *
+        * Since this is a read-only operation, we take shared instead of exclusive
+        * lock.  There's not a whole lot of point to this, because all the normal
+        * operations require exclusive lock, but it doesn't hurt anything either.
+        * It will at least allow two backends to do GetLockStatusData in parallel.
+        *
+        * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+        *
+        * Use same loop to count up the total number of PROCLOCK objects.
+        */
+       els = 0;
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       {
+               LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);
+               proclockTable = LockMethodProcLockHash[i];
+               els += proclockTable->hctl->nentries;
+       }
 
-       data->proclockaddrs = (SHMEM_OFFSET *) palloc(sizeof(SHMEM_OFFSET) * i);
-       data->proclocks = (PROCLOCK *) palloc(sizeof(PROCLOCK) * i);
-       data->procs = (PGPROC *) palloc(sizeof(PGPROC) * i);
-       data->locks = (LOCK *) palloc(sizeof(LOCK) * i);
+       data->nelements = els;
+       data->proclockaddrs = (SHMEM_OFFSET *) palloc(sizeof(SHMEM_OFFSET) * els);
+       data->proclocks = (PROCLOCK *) palloc(sizeof(PROCLOCK) * els);
+       data->procs = (PGPROC *) palloc(sizeof(PGPROC) * els);
+       data->locks = (LOCK *) palloc(sizeof(LOCK) * els);
 
-       hash_seq_init(&seqstat, proclockTable);
+       el = 0;
 
-       i = 0;
-       while ((proclock = hash_seq_search(&seqstat)))
+       /* Now scan the tables to copy the data */
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
        {
-               PGPROC     *proc = (PGPROC *) MAKE_PTR(proclock->tag.proc);
-               LOCK       *lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+               proclockTable = LockMethodProcLockHash[i];
+               hash_seq_init(&seqstat, proclockTable);
 
-               data->proclockaddrs[i] = MAKE_OFFSET(proclock);
-               memcpy(&(data->proclocks[i]), proclock, sizeof(PROCLOCK));
-               memcpy(&(data->procs[i]), proc, sizeof(PGPROC));
-               memcpy(&(data->locks[i]), lock, sizeof(LOCK));
+               while ((proclock = hash_seq_search(&seqstat)))
+               {
+                       PGPROC     *proc = (PGPROC *) MAKE_PTR(proclock->tag.proc);
+                       LOCK       *lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+
+                       data->proclockaddrs[el] = MAKE_OFFSET(proclock);
+                       memcpy(&(data->proclocks[el]), proclock, sizeof(PROCLOCK));
+                       memcpy(&(data->procs[el]), proc, sizeof(PGPROC));
+                       memcpy(&(data->locks[el]), lock, sizeof(LOCK));
 
-               i++;
+                       el++;
+               }
        }
 
-       LWLockRelease(LockMgrLock);
+       /* And release locks */
+       for (i = NUM_LOCK_PARTITIONS; --i >= 0; )
+               LWLockRelease(FirstLockMgrLock + i);
 
-       Assert(i == data->nelements);
+       Assert(el == data->nelements);
 
        return data;
 }
@@ -1879,7 +2001,7 @@ GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
 
 #ifdef LOCK_DEBUG
 /*
- * Dump all locks in the given proc's procLocks list.
+ * Dump all locks in the given proc's myProcLocks lists.
  *
  * Caller is responsible for having acquired appropriate LWLocks.
  */
@@ -1889,29 +2011,34 @@ DumpLocks(PGPROC *proc)
        SHM_QUEUE  *procLocks;
        PROCLOCK   *proclock;
        LOCK       *lock;
+       int                     i;
 
        if (proc == NULL)
                return;
 
-       procLocks = &proc->procLocks;
-
        if (proc->waitLock)
                LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
 
-       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                                                                offsetof(PROCLOCK, procLink));
-
-       while (proclock)
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
        {
-               Assert(proclock->tag.proc == MAKE_OFFSET(proc));
+               procLocks = &(proc->myProcLocks[i]);
 
-               lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+               proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                                                                        offsetof(PROCLOCK, procLink));
 
-               PROCLOCK_PRINT("DumpLocks", proclock);
-               LOCK_PRINT("DumpLocks", lock, 0);
+               while (proclock)
+               {
+                       Assert(proclock->tag.proc == MAKE_OFFSET(proc));
 
-               proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                                                                        offsetof(PROCLOCK, procLink));
+                       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+
+                       PROCLOCK_PRINT("DumpLocks", proclock);
+                       LOCK_PRINT("DumpLocks", lock, 0);
+
+                       proclock = (PROCLOCK *)
+                               SHMQueueNext(procLocks, &proclock->procLink,
+                                                        offsetof(PROCLOCK, procLink));
+               }
        }
 }
 
@@ -1928,25 +2055,30 @@ DumpAllLocks(void)
        LOCK       *lock;
        HTAB       *proclockTable;
        HASH_SEQ_STATUS status;
+       int                     i;
 
        proc = MyProc;
-       proclockTable = LockMethodProcLockHash;
 
        if (proc && proc->waitLock)
                LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
 
-       hash_seq_init(&status, proclockTable);
-       while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
        {
-               PROCLOCK_PRINT("DumpAllLocks", proclock);
+               proclockTable = LockMethodProcLockHash[i];
+               hash_seq_init(&status, proclockTable);
 
-               if (proclock->tag.lock)
+               while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
                {
-                       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
-                       LOCK_PRINT("DumpAllLocks", lock, 0);
+                       PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+                       if (proclock->tag.lock)
+                       {
+                               lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+                               LOCK_PRINT("DumpAllLocks", lock, 0);
+                       }
+                       else
+                               elog(LOG, "DumpAllLocks: proclock->tag.lock = NULL");
                }
-               else
-                       elog(LOG, "DumpAllLocks: proclock->tag.lock = NULL");
        }
 }
 #endif   /* LOCK_DEBUG */
@@ -1975,7 +2107,8 @@ lock_twophase_recover(TransactionId xid, uint16 info,
        PROCLOCK   *proclock;
        PROCLOCKTAG proclocktag;
        bool            found;
-       LWLockId        masterLock;
+       int                     partition;
+       LWLockId        partitionLock;
        LockMethod      lockMethodTable;
 
        Assert(len == sizeof(TwoPhaseLockRecord));
@@ -1987,19 +2120,20 @@ lock_twophase_recover(TransactionId xid, uint16 info,
                elog(ERROR, "unrecognized lock method: %d", lockmethodid);
        lockMethodTable = LockMethods[lockmethodid];
 
-       masterLock = LockMgrLock;
+       partition = LockTagToPartition(locktag);
+       partitionLock = FirstLockMgrLock + partition;
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        /*
         * Find or create a lock with this tag.
         */
-       lock = (LOCK *) hash_search(LockMethodLockHash,
+       lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                                                (void *) locktag,
                                                                HASH_ENTER_NULL, &found);
        if (!lock)
        {
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of shared memory"),
@@ -2039,7 +2173,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
        /*
         * Find or create a proclock entry with this tag
         */
-       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                                                                (void *) &proclocktag,
                                                                                HASH_ENTER_NULL, &found);
        if (!proclock)
@@ -2054,12 +2188,12 @@ lock_twophase_recover(TransactionId xid, uint16 info,
                         * anyone to release the lock object later.
                         */
                        Assert(SHMQueueEmpty(&(lock->procLocks)));
-                       if (!hash_search(LockMethodLockHash,
+                       if (!hash_search(LockMethodLockHash[partition],
                                                         (void *) &(lock->tag),
                                                         HASH_REMOVE, NULL))
                                elog(PANIC, "lock table corrupted");
                }
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of shared memory"),
@@ -2075,7 +2209,8 @@ lock_twophase_recover(TransactionId xid, uint16 info,
                proclock->releaseMask = 0;
                /* Add proclock to appropriate lists */
                SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
-               SHMQueueInsertBefore(&proc->procLocks, &proclock->procLink);
+               SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+                                                        &proclock->procLink);
                PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
        }
        else
@@ -2106,7 +2241,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
         */
        GrantLock(lock, proclock, lockmode);
 
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
 }
 
 /*
@@ -2123,10 +2258,11 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
        LOCKTAG    *locktag;
        LOCKMODE        lockmode;
        LOCKMETHODID lockmethodid;
-       PROCLOCKTAG proclocktag;
        LOCK       *lock;
        PROCLOCK   *proclock;
-       LWLockId        masterLock;
+       PROCLOCKTAG proclocktag;
+       int                     partition;
+       LWLockId        partitionLock;
        LockMethod      lockMethodTable;
        bool            wakeupNeeded;
 
@@ -2139,14 +2275,15 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
                elog(ERROR, "unrecognized lock method: %d", lockmethodid);
        lockMethodTable = LockMethods[lockmethodid];
 
-       masterLock = LockMgrLock;
+       partition = LockTagToPartition(locktag);
+       partitionLock = FirstLockMgrLock + partition;
 
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        /*
         * Re-find the lock object (it had better be there).
         */
-       lock = (LOCK *) hash_search(LockMethodLockHash,
+       lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                                                (void *) locktag,
                                                                HASH_FIND, NULL);
        if (!lock)
@@ -2158,7 +2295,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
        MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));           /* must clear padding */
        proclocktag.lock = MAKE_OFFSET(lock);
        proclocktag.proc = MAKE_OFFSET(proc);
-       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+       proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                                                                (void *) &proclocktag,
                                                                                HASH_FIND, NULL);
        if (!proclock)
@@ -2171,7 +2308,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
        if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
        {
                PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
-               LWLockRelease(masterLock);
+               LWLockRelease(partitionLock);
                elog(WARNING, "you don't own a lock of type %s",
                         lockMethodTable->lockModeNames[lockmode]);
                return;
@@ -2182,9 +2319,11 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
         */
        wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
 
-       CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+       CleanUpLock(lock, proclock,
+                               lockMethodTable, partition,
+                               wakeupNeeded);
 
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
 }
 
 /*
index a215a65285511dca9c7ca6f26145d1bcacd67a12..e1edabde905efa7004c02285742fb455a6b2843b 100644 (file)
@@ -8,14 +8,14 @@
  * exclusive and shared lock modes (to support read/write and read-only
  * access to a shared object). There are few other frammishes.  User-level
  * locking should be done with the full lock manager --- which depends on
- * an LWLock to protect its shared state.
+ * LWLocks to protect its shared state.
  *
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.35 2005/12/06 23:08:33 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.36 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -125,7 +125,10 @@ NumLWLocks(void)
         */
 
        /* Predefined LWLocks */
-       numLocks = (int) NumFixedLWLocks;
+       numLocks = (int) FirstLockMgrLock;
+
+       /* lock.c gets the ones starting at FirstLockMgrLock */
+       numLocks += NUM_LOCK_PARTITIONS;
 
        /* bufmgr.c needs two for each shared buffer */
        numLocks += 2 * NBuffers;
@@ -204,10 +207,11 @@ CreateLWLocks(void)
 
        /*
         * Initialize the dynamic-allocation counter, which is stored just before
-        * the first LWLock.
+        * the first LWLock.  The LWLocks used by lock.c are not dynamically
+        * allocated; lock.c just assumes it has them.
         */
        LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int));
-       LWLockCounter[0] = (int) NumFixedLWLocks;
+       LWLockCounter[0] = (int) FirstLockMgrLock + NUM_LOCK_PARTITIONS;
        LWLockCounter[1] = numLocks;
 }
 
index 8d8269041e7f2ba24fcb38603930dc8de5f7e262..34d80bfceeacf8ea4e5007831964dab095e8440b 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.169 2005/12/09 01:22:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.170 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -18,9 +18,8 @@
  *             ProcQueueAlloc() -- create a shm queue for sleeping processes
  *             ProcQueueInit() -- create a queue without allocing memory
  *
- * Locking and waiting for buffers can cause the backend to be
- * put to sleep.  Whoever releases the lock, etc. wakes the
- * process up again (and gives it an error code so it knows
+ * Waiting for a lock causes the backend to be put to sleep.  Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
  * whether it was awoken on an error condition).
  *
  * Interface (b):
@@ -28,7 +27,7 @@
  * ProcReleaseLocks -- frees the locks associated with current transaction
  *
  * ProcKill -- destroys the shared memory state (and locks)
- *             associated with the process.
+ * associated with the process.
  */
 #include "postgres.h"
 
@@ -65,7 +64,8 @@ NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
 static PROC_HDR *ProcGlobal = NULL;
 static PGPROC *DummyProcs = NULL;
 
-static bool waitingForLock = false;
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
 
 /* Mark these volatile because they can be changed by signal handler */
 static volatile bool statement_timeout_active = false;
@@ -200,10 +200,10 @@ InitProcGlobal(void)
 void
 InitProcess(void)
 {
-       SHMEM_OFFSET myOffset;
-
        /* use volatile pointer to prevent code rearrangement */
        volatile PROC_HDR *procglobal = ProcGlobal;
+       SHMEM_OFFSET myOffset;
+       int                     i;
 
        /*
         * ProcGlobal should be set by a previous call to InitProcGlobal (if we
@@ -264,7 +264,8 @@ InitProcess(void)
        MyProc->lwWaitLink = NULL;
        MyProc->waitLock = NULL;
        MyProc->waitProcLock = NULL;
-       SHMQueueInit(&(MyProc->procLocks));
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+               SHMQueueInit(&(MyProc->myProcLocks[i]));
 
        /*
         * Add our PGPROC to the PGPROC array in shared memory.
@@ -304,6 +305,7 @@ void
 InitDummyProcess(int proctype)
 {
        PGPROC     *dummyproc;
+       int                     i;
 
        /*
         * ProcGlobal should be set by a previous call to InitProcGlobal (we
@@ -360,7 +362,8 @@ InitDummyProcess(int proctype)
        MyProc->lwWaitLink = NULL;
        MyProc->waitLock = NULL;
        MyProc->waitProcLock = NULL;
-       SHMQueueInit(&(MyProc->procLocks));
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+               SHMQueueInit(&(MyProc->myProcLocks[i]));
 
        /*
         * Arrange to clean up at process exit.
@@ -416,21 +419,24 @@ HaveNFreeProcs(int n)
 bool
 LockWaitCancel(void)
 {
+       LWLockId        partitionLock;
+
        /* Nothing to do if we weren't waiting for a lock */
-       if (!waitingForLock)
+       if (lockAwaited == NULL)
                return false;
 
        /* Turn off the deadlock timer, if it's still running (see ProcSleep) */
        disable_sig_alarm(false);
 
        /* Unlink myself from the wait queue, if on it (might not be anymore!) */
-       LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
+       partitionLock = FirstLockMgrLock + lockAwaited->partition;
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        if (MyProc->links.next != INVALID_OFFSET)
        {
                /* We could not have been granted the lock yet */
                Assert(MyProc->waitStatus == STATUS_ERROR);
-               RemoveFromWaitQueue(MyProc);
+               RemoveFromWaitQueue(MyProc, lockAwaited->partition);
        }
        else
        {
@@ -444,9 +450,9 @@ LockWaitCancel(void)
                        GrantAwaitedLock();
        }
 
-       waitingForLock = false;
+       lockAwaited = NULL;
 
-       LWLockRelease(LockMgrLock);
+       LWLockRelease(partitionLock);
 
        /*
         * Reset the proc wait semaphore to zero.  This is necessary in the
@@ -606,18 +612,18 @@ ProcQueueInit(PROC_QUEUE *queue)
 
 
 /*
- * ProcSleep -- put a process to sleep
+ * ProcSleep -- put a process to sleep on the specified lock
  *
  * Caller must have set MyProc->heldLocks to reflect locks already held
  * on the lockable object by this process (under all XIDs).
  *
- * Locktable's masterLock must be held at entry, and will be held
+ * The lock table's partition lock must be held at entry, and will be held
  * at exit.
  *
  * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
  *
  * ASSUME: that no one will fiddle with the queue until after
- *             we release the masterLock.
+ *             we release the partition lock.
  *
  * NOTES: The process queue is now a priority queue for locking.
  *
@@ -625,12 +631,13 @@ ProcQueueInit(PROC_QUEUE *queue)
  * semaphore is normally zero, so when we try to acquire it, we sleep.
  */
 int
-ProcSleep(LockMethod lockMethodTable,
-                 LOCKMODE lockmode,
-                 LOCK *lock,
-                 PROCLOCK *proclock)
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 {
-       LWLockId        masterLock = LockMgrLock;
+       LOCKMODE        lockmode = locallock->tag.mode;
+       LOCK       *lock = locallock->lock;
+       PROCLOCK   *proclock = locallock->proclock;
+       int                     partition = locallock->partition;
+       LWLockId        partitionLock = FirstLockMgrLock + partition;
        PROC_QUEUE *waitQueue = &(lock->waitProcs);
        LOCKMASK        myHeldLocks = MyProc->heldLocks;
        bool            early_deadlock = false;
@@ -732,22 +739,22 @@ ProcSleep(LockMethod lockMethodTable,
         */
        if (early_deadlock)
        {
-               RemoveFromWaitQueue(MyProc);
+               RemoveFromWaitQueue(MyProc, partition);
                return STATUS_ERROR;
        }
 
        /* mark that we are waiting for a lock */
-       waitingForLock = true;
+       lockAwaited = locallock;
 
        /*
-        * Release the locktable's masterLock.
+        * Release the lock table's partition lock.
         *
         * NOTE: this may also cause us to exit critical-section state, possibly
         * allowing a cancel/die interrupt to be accepted. This is OK because we
         * have recorded the fact that we are waiting for a lock, and so
         * LockWaitCancel will clean up if cancel/die happens.
         */
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
 
        /*
         * Set timer so we can wake up after awhile and check for a deadlock. If a
@@ -785,16 +792,16 @@ ProcSleep(LockMethod lockMethodTable,
                elog(FATAL, "could not disable timer for process wakeup");
 
        /*
-        * Re-acquire the locktable's masterLock.  We have to do this to hold off
-        * cancel/die interrupts before we can mess with waitingForLock (else we
-        * might have a missed or duplicated locallock update).
+        * Re-acquire the lock table's partition lock.  We have to do this to
+        * hold off cancel/die interrupts before we can mess with lockAwaited
+        * (else we might have a missed or duplicated locallock update).
         */
-       LWLockAcquire(masterLock, LW_EXCLUSIVE);
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 
        /*
         * We no longer want LockWaitCancel to do anything.
         */
-       waitingForLock = false;
+       lockAwaited = NULL;
 
        /*
         * If we got the lock, be sure to remember it in the locallock table.
@@ -816,6 +823,8 @@ ProcSleep(LockMethod lockMethodTable,
  *      Also remove the process from the wait queue and set its links invalid.
  *      RETURN: the next process in the wait queue.
  *
+ * The appropriate lock partition lock must be held by caller.
+ *
  * XXX: presently, this code is only used for the "success" case, and only
  * works correctly for that case.  To clean up in failure case, would need
  * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
@@ -825,8 +834,6 @@ ProcWakeup(PGPROC *proc, int waitStatus)
 {
        PGPROC     *retProc;
 
-       /* assume that masterLock has been acquired */
-
        /* Proc should be sleeping ... */
        if (proc->links.prev == INVALID_OFFSET ||
                proc->links.next == INVALID_OFFSET)
@@ -854,6 +861,8 @@ ProcWakeup(PGPROC *proc, int waitStatus)
  * ProcLockWakeup -- routine for waking up processes when a lock is
  *             released (or a prior waiter is aborted).  Scan all waiters
  *             for lock, waken any that are no longer blocked.
+ *
+ * The appropriate lock partition lock must be held by caller.
  */
 void
 ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
@@ -908,25 +917,32 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
        Assert(waitQueue->size >= 0);
 }
 
-/* --------------------
+/*
+ * CheckDeadLock
+ *
  * We only get to this routine if we got SIGALRM after DeadlockTimeout
  * while waiting for a lock to be released by some other process.  Look
  * to see if there's a deadlock; if not, just return and continue waiting.
  * If we have a real deadlock, remove ourselves from the lock's wait queue
  * and signal an error to ProcSleep.
- * --------------------
  */
 static void
 CheckDeadLock(void)
 {
+       int                     i;
+
        /*
-        * Acquire locktable lock.      Note that the deadlock check interrupt had
-        * better not be enabled anywhere that this process itself holds the
-        * locktable lock, else this will wait forever.  Also note that
-        * LWLockAcquire creates a critical section, so that this routine cannot
-        * be interrupted by cancel/die interrupts.
+        * Acquire exclusive lock on the entire shared lock data structures.
+        * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+        *
+        * Note that the deadlock check interrupt had better not be enabled
+        * anywhere that this process itself holds lock partition locks, else this
+        * will wait forever.  Also note that LWLockAcquire creates a critical
+        * section, so that this routine cannot be interrupted by cancel/die
+        * interrupts.
         */
-       LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
+       for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+               LWLockAcquire(FirstLockMgrLock + i, LW_EXCLUSIVE);
 
        /*
         * Check to see if we've been awoken by anyone in the interim.
@@ -937,14 +953,11 @@ CheckDeadLock(void)
         *
         * We check by looking to see if we've been unlinked from the wait queue.
         * This is quicker than checking our semaphore's state, since no kernel
-        * call is needed, and it is safe because we hold the locktable lock.
+        * call is needed, and it is safe because we hold all the partition locks.
         */
        if (MyProc->links.prev == INVALID_OFFSET ||
                MyProc->links.next == INVALID_OFFSET)
-       {
-               LWLockRelease(LockMgrLock);
-               return;
-       }
+               goto check_done;
 
 #ifdef LOCK_DEBUG
        if (Debug_deadlocks)
@@ -954,16 +967,19 @@ CheckDeadLock(void)
        if (!DeadLockCheck(MyProc))
        {
                /* No deadlock, so keep waiting */
-               LWLockRelease(LockMgrLock);
-               return;
+               goto check_done;
        }
 
        /*
         * Oops.  We have a deadlock.
         *
-        * Get this process out of wait state.
+        * Get this process out of wait state.  (Note: we could do this more
+        * efficiently by relying on lockAwaited, but use this coding to preserve
+        * the flexibility to kill some other transaction than the one detecting
+        * the deadlock.)
         */
-       RemoveFromWaitQueue(MyProc);
+       Assert(MyProc->waitLock != NULL);
+       RemoveFromWaitQueue(MyProc, LockTagToPartition(&(MyProc->waitLock->tag)));
 
        /*
         * Set MyProc->waitStatus to STATUS_ERROR so that ProcSleep will report an
@@ -987,7 +1003,15 @@ CheckDeadLock(void)
         * them anymore.  However, RemoveFromWaitQueue took care of waking up any
         * such processes.
         */
-       LWLockRelease(LockMgrLock);
+
+       /*
+        * Release locks acquired at head of routine.  Order is not critical,
+        * so do it back-to-front to avoid waking another CheckDeadLock instance
+        * before it can get all the locks.
+        */
+check_done:
+       for (i = NUM_LOCK_PARTITIONS; --i >= 0; )
+               LWLockRelease(FirstLockMgrLock + i);
 }
 
 
index e289632054cc6fe70ad83454866005a3b73dfb24..9af03fb4742785828c8b2ea11153f9e8c2f7c47f 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.92 2005/12/09 01:22:04 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.93 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "storage/shmem.h"
 
 
+/*
+ * Number of partitions the shared lock tables are divided into.
+ *
+ * See LockTagToPartition() if you change this.
+ */
+#define NUM_LOCK_PARTITIONS  16
+
 /* originally in procq.h */
 typedef struct PROC_QUEUE
 {
@@ -348,6 +355,7 @@ typedef struct LOCALLOCK
        LOCK       *lock;                       /* associated LOCK object in shared mem */
        PROCLOCK   *proclock;           /* associated PROCLOCK object in shmem */
        bool            isTempObject;   /* true if lock is on a temporary object */
+       int                     partition;              /* ID of partition containing this lock */
        int                     nLocks;                 /* total number of times lock is held */
        int                     numLockOwners;  /* # of relevant ResourceOwners */
        int                     maxLockOwners;  /* allocated size of array */
@@ -389,6 +397,7 @@ typedef enum
  */
 extern void InitLocks(void);
 extern LockMethod GetLocksMethodTable(const LOCK *lock);
+extern int     LockTagToPartition(const LOCKTAG *locktag);
 extern LockAcquireResult LockAcquire(const LOCKTAG *locktag,
                        bool isTempObject,
                        LOCKMODE lockmode,
@@ -406,7 +415,7 @@ extern int LockCheckConflicts(LockMethod lockMethodTable,
                                   LOCK *lock, PROCLOCK *proclock, PGPROC *proc);
 extern void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode);
 extern void GrantAwaitedLock(void);
-extern void RemoveFromWaitQueue(PGPROC *proc);
+extern void RemoveFromWaitQueue(PGPROC *proc, int partition);
 extern Size LockShmemSize(void);
 extern bool DeadLockCheck(PGPROC *proc);
 extern void DeadLockReport(void);
index 4291e0b2e747b9fc3b4682e63a325f9bf423e7ed..c318e60b5771fbeb35845dad0d0eea98226278f4 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.23 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.24 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,9 +16,9 @@
 
 /*
  * We have a number of predefined LWLocks, plus a bunch of LWLocks that are
- * dynamically assigned (for shared buffers).  The LWLock structures live
- * in shared memory (since they contain shared data) and are identified by
- * values of this enumerated type.     We abuse the notion of an enum somewhat
+ * dynamically assigned (e.g., for shared buffers).  The LWLock structures
+ * live in shared memory (since they contain shared data) and are identified
+ * by values of this enumerated type.  We abuse the notion of an enum somewhat
  * by allowing values not listed in the enum declaration to be assigned.
  * The extra value MaxDynamicLWLock is there to keep the compiler from
  * deciding that the enum can be represented as char or short ...
@@ -27,7 +27,6 @@ typedef enum LWLockId
 {
        BufMappingLock,
        BufFreelistLock,
-       LockMgrLock,
        OidGenLock,
        XidGenLock,
        ProcArrayLock,
@@ -46,8 +45,7 @@ typedef enum LWLockId
        RelCacheInitLock,
        BgWriterCommLock,
        TwoPhaseStateLock,
-
-       NumFixedLWLocks,                        /* must be last except for MaxDynamicLWLock */
+       FirstLockMgrLock,                       /* must be last except for MaxDynamicLWLock */
 
        MaxDynamicLWLock = 1000000000
 } LWLockId;
index 4cba391048eb582b803e5d1e02081ea12ace211e..2cfee41eff91e85f6767aa8098402e36ea69148a 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.84 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.85 2005/12/11 21:02:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,7 +52,8 @@ struct XidCache
  * so that the prepared transactions appear to be still running and are
  * correctly shown as holding locks.  A prepared transaction PGPROC can be
  * distinguished from a real one at need by the fact that it has pid == 0.
- * The semaphore and lock-related fields in a prepared-xact PGPROC are unused.
+ * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
+ * but its myProcLocks[] lists are valid.
  */
 struct PGPROC
 {
@@ -86,8 +87,12 @@ struct PGPROC
        LOCKMASK        heldLocks;              /* bitmask for lock types already held on this
                                                                 * lock object by this backend */
 
-       SHM_QUEUE       procLocks;              /* list of PROCLOCK objects for locks held or
-                                                                * awaited by this backend */
+       /*
+        * All PROCLOCK objects for locks held or awaited by this backend are
+        * linked into one of these lists, according to the partition number of
+        * their lock.
+        */
+       SHM_QUEUE       myProcLocks[NUM_LOCK_PARTITIONS];
 
        struct XidCache subxids;        /* cache for subtransaction XIDs */
 };
@@ -99,7 +104,7 @@ extern DLLIMPORT PGPROC *MyProc;
 
 
 /*
- * There is one ProcGlobal struct for the whole installation.
+ * There is one ProcGlobal struct for the whole database cluster.
  */
 typedef struct PROC_HDR
 {
@@ -134,8 +139,7 @@ extern bool HaveNFreeProcs(int n);
 extern void ProcReleaseLocks(bool isCommit);
 
 extern void ProcQueueInit(PROC_QUEUE *queue);
-extern int ProcSleep(LockMethod lockMethodTable, LOCKMODE lockmode,
-                 LOCK *lock, PROCLOCK *proclock);
+extern int     ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
 extern PGPROC *ProcWakeup(PGPROC *proc, int waitStatus);
 extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
 extern bool LockWaitCancel(void);