Fix up pgstats counting of live and dead tuples to recognize that committed
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 May 2007 03:50:39 +0000 (03:50 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 May 2007 03:50:39 +0000 (03:50 +0000)
and aborted transactions have different effects; also teach it not to assume
that prepared transactions are always committed.

Along the way, simplify the pgstats API by tying counting directly to
Relations; I cannot detect any redeeming social value in having stats
pointers in HeapScanDesc and IndexScanDesc structures.  And fix a few
corner cases in which counts might be missed because the relation's
pgstat_info pointer hadn't been set.

20 files changed:
src/backend/access/gin/ginscan.c
src/backend/access/gist/gistget.c
src/backend/access/hash/hashsearch.c
src/backend/access/heap/heapam.c
src/backend/access/index/genam.c
src/backend/access/index/indexam.c
src/backend/access/nbtree/nbtsearch.c
src/backend/access/transam/twophase.c
src/backend/access/transam/twophase_rmgr.c
src/backend/access/transam/xact.c
src/backend/executor/nodeBitmapHeapscan.c
src/backend/postmaster/bgwriter.c
src/backend/postmaster/pgstat.c
src/backend/storage/buffer/bufmgr.c
src/backend/utils/cache/relcache.c
src/include/access/heapam.h
src/include/access/relscan.h
src/include/access/twophase_rmgr.h
src/include/pgstat.h
src/include/utils/rel.h

index 22896bc5d77d06fd77c08e551137ed05156f6e4f..2eb1ba95b4bd4296ab53daaaaa1e28ad2e6b4986 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *                     $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.9 2007/01/31 15:09:45 teodor Exp $
+ *                     $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.10 2007/05/27 03:50:38 tgl Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -189,7 +189,7 @@ newScanKey(IndexScanDesc scan)
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("GIN index does not support search with void query")));
 
-       pgstat_count_index_scan(&scan->xs_pgstat_info);
+       pgstat_count_index_scan(scan->indexRelation);
 }
 
 Datum
index 226812322aa603a0c9caa23c5f3409f705a9185d..ed839de40342524a0e37819c8f038535c0a1a0e5 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.65 2007/04/06 22:33:41 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.66 2007/05/27 03:50:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -165,7 +165,7 @@ gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids,
                stk->next = NULL;
                stk->block = GIST_ROOT_BLKNO;
 
-               pgstat_count_index_scan(&scan->xs_pgstat_info);
+               pgstat_count_index_scan(scan->indexRelation);
        }
        else if (so->curbuf == InvalidBuffer)
        {
index 5de0f402297b97a24d6e93595cd3d5c302a0feb1..104a0c14de380d509a892a983c05b06d8ab4975f 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.49 2007/05/03 16:45:58 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.50 2007/05/27 03:50:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -127,7 +127,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
        ItemPointer current;
        OffsetNumber offnum;
 
-       pgstat_count_index_scan(&scan->xs_pgstat_info);
+       pgstat_count_index_scan(rel);
 
        current = &(so->hashso_curpos);
        ItemPointerSetInvalid(current);
index ee2be7cfdb1926598346303e2f6e84deadfbd0f2..9edeaff130686ade0f026a977095d9440b0b25a4 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.232 2007/04/08 01:26:27 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -100,7 +100,7 @@ initscan(HeapScanDesc scan, ScanKey key)
        if (key != NULL)
                memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
 
-       pgstat_count_heap_scan(&scan->rs_pgstat_info);
+       pgstat_count_heap_scan(scan->rs_rd);
 }
 
 /*
@@ -701,6 +701,8 @@ relation_open(Oid relationId, LOCKMODE lockmode)
        if (!RelationIsValid(r))
                elog(ERROR, "could not open relation with OID %u", relationId);
 
+       pgstat_initstats(r);
+
        return r;
 }
 
@@ -743,6 +745,8 @@ try_relation_open(Oid relationId, LOCKMODE lockmode)
        if (!RelationIsValid(r))
                elog(ERROR, "could not open relation with OID %u", relationId);
 
+       pgstat_initstats(r);
+
        return r;
 }
 
@@ -787,6 +791,8 @@ relation_open_nowait(Oid relationId, LOCKMODE lockmode)
        if (!RelationIsValid(r))
                elog(ERROR, "could not open relation with OID %u", relationId);
 
+       pgstat_initstats(r);
+
        return r;
 }
 
@@ -873,8 +879,6 @@ heap_open(Oid relationId, LOCKMODE lockmode)
                                 errmsg("\"%s\" is a composite type",
                                                RelationGetRelationName(r))));
 
-       pgstat_initstats(&r->pgstat_info, r);
-
        return r;
 }
 
@@ -903,8 +907,6 @@ heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
                                 errmsg("\"%s\" is a composite type",
                                                RelationGetRelationName(r))));
 
-       pgstat_initstats(&r->pgstat_info, r);
-
        return r;
 }
 
@@ -954,8 +956,6 @@ heap_beginscan(Relation relation, Snapshot snapshot,
        else
                scan->rs_key = NULL;
 
-       pgstat_initstats(&scan->rs_pgstat_info, relation);
-
        initscan(scan, key);
 
        return scan;
@@ -1059,7 +1059,7 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
         */
        HEAPDEBUG_3;                            /* heap_getnext returning tuple */
 
-       pgstat_count_heap_getnext(&scan->rs_pgstat_info);
+       pgstat_count_heap_getnext(scan->rs_rd);
 
        return &(scan->rs_ctup);
 }
@@ -1086,6 +1086,10 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
  * and return it in *userbuf (so the caller must eventually unpin it); when
  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
  *
+ * stats_relation is the relation to charge the heap_fetch operation against
+ * for statistical purposes.  (This could be the heap rel itself, an
+ * associated index, or NULL to not count the fetch at all.)
+ *
  * It is somewhat inconsistent that we ereport() on invalid block number but
  * return false on invalid item number.  There are a couple of reasons though.
  * One is that the caller can relatively easily check the block number for
@@ -1101,12 +1105,12 @@ heap_fetch(Relation relation,
                   HeapTuple tuple,
                   Buffer *userbuf,
                   bool keep_buf,
-                  PgStat_Info *pgstat_info)
+                  Relation stats_relation)
 {
        /* Assume *userbuf is undefined on entry */
        *userbuf = InvalidBuffer;
        return heap_release_fetch(relation, snapshot, tuple,
-                                                         userbuf, keep_buf, pgstat_info);
+                                                         userbuf, keep_buf, stats_relation);
 }
 
 /*
@@ -1125,7 +1129,7 @@ heap_release_fetch(Relation relation,
                                   HeapTuple tuple,
                                   Buffer *userbuf,
                                   bool keep_buf,
-                                  PgStat_Info *pgstat_info)
+                                  Relation stats_relation)
 {
        ItemPointer tid = &(tuple->t_self);
        ItemId          lp;
@@ -1210,9 +1214,9 @@ heap_release_fetch(Relation relation,
                 */
                *userbuf = buffer;
 
-               /* Count the successful fetch in *pgstat_info, if given. */
-               if (pgstat_info != NULL)
-                       pgstat_count_heap_fetch(pgstat_info);
+               /* Count the successful fetch against appropriate rel, if any */
+               if (stats_relation != NULL)
+                       pgstat_count_heap_fetch(stats_relation);
 
                return true;
        }
@@ -1517,7 +1521,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
         */
        CacheInvalidateHeapTuple(relation, heaptup);
 
-       pgstat_count_heap_insert(&relation->pgstat_info);
+       pgstat_count_heap_insert(relation);
 
        /*
         * If heaptup is a private copy, release it.  Don't forget to copy t_self
@@ -1807,7 +1811,7 @@ l1:
        if (have_tuple_lock)
                UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
 
-       pgstat_count_heap_delete(&relation->pgstat_info);
+       pgstat_count_heap_delete(relation);
 
        return HeapTupleMayBeUpdated;
 }
@@ -2269,7 +2273,7 @@ l2:
        if (have_tuple_lock)
                UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
 
-       pgstat_count_heap_update(&relation->pgstat_info);
+       pgstat_count_heap_update(relation);
 
        /*
         * If heaptup is a private copy, release it.  Don't forget to copy t_self
index 49ffff6e51db8a8f87f4fbdb01d2da29a9638663..0009739180c8528ec77d233928a1e50a20cec4be 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.61 2007/01/20 18:43:35 neilc Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $
  *
  * NOTES
  *       many of the old access method routines have been turned into
@@ -96,8 +96,6 @@ RelationGetIndexScan(Relation indexRelation,
        scan->xs_ctup.t_data = NULL;
        scan->xs_cbuf = InvalidBuffer;
 
-       pgstat_initstats(&scan->xs_pgstat_info, indexRelation);
-
        /*
         * Let the AM fill in the key and any opaque data it wants.
         */
index 23522ba740ea453cf67110d5bb6c8a67fef75b31..d905013a5fce45ace3d5f53335fff6f2ce59d3ac 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.97 2007/01/05 22:19:23 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $
  *
  * INTERFACE ROUTINES
  *             index_open              - open an index relation by relation OID
@@ -145,8 +145,6 @@ index_open(Oid relationId, LOCKMODE lockmode)
                                 errmsg("\"%s\" is not an index",
                                                RelationGetRelationName(r))));
 
-       pgstat_initstats(&r->pgstat_info, r);
-
        return r;
 }
 
@@ -433,14 +431,14 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
                        return NULL;            /* failure exit */
                }
 
-               pgstat_count_index_tuples(&scan->xs_pgstat_info, 1);
+               pgstat_count_index_tuples(scan->indexRelation, 1);
 
                /*
                 * Fetch the heap tuple and see if it matches the snapshot.
                 */
                if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
                                                           heapTuple, &scan->xs_cbuf, true,
-                                                          &scan->xs_pgstat_info))
+                                                          scan->indexRelation))
                        break;
 
                /* Skip if no undeleted tuple at this location */
@@ -502,7 +500,7 @@ index_getnext_indexitem(IndexScanDesc scan,
                                                                           Int32GetDatum(direction)));
 
        if (found)
-               pgstat_count_index_tuples(&scan->xs_pgstat_info, 1);
+               pgstat_count_index_tuples(scan->indexRelation, 1);
 
        return found;
 }
@@ -543,7 +541,7 @@ index_getmulti(IndexScanDesc scan,
                                                                           Int32GetDatum(max_tids),
                                                                           PointerGetDatum(returned_tids)));
 
-       pgstat_count_index_tuples(&scan->xs_pgstat_info, *returned_tids);
+       pgstat_count_index_tuples(scan->indexRelation, *returned_tids);
 
        return found;
 }
index 036a97a8d04de5a990bcafa56f92c6a7e4f56664..b947d770aa2cd8872c73044c40f72a53fd9a4263 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.112 2007/04/06 22:33:42 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.113 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -453,7 +453,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
        int                     i;
        StrategyNumber strat_total;
 
-       pgstat_count_index_scan(&scan->xs_pgstat_info);
+       pgstat_count_index_scan(rel);
 
        /*
         * Examine the scan keys and eliminate any redundant keys; also mark the
index 6f495a84087b832ce1171897088db266a2d5da29..7fdf5a7eed389f84e13b6e7fb00060e10d5fb589 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.30 2007/04/30 21:01:52 tgl Exp $
+ *             $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.31 2007/05/27 03:50:39 tgl Exp $
  *
  * NOTES
  *             Each global transaction is associated with a global transaction
@@ -1211,7 +1211,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
        else
                ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
 
-       pgstat_count_xact_commit();
+       /* Count the prepared xact as committed or aborted */
+       AtEOXact_PgStat(isCommit);
 
        /*
         * And now we can clean up our mess.
index e93bac7b2d8199db0c7fe283170f16f62abc4c40..9c2f14a1a38c90417299a44541e7bc8d60d67e49 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.4 2007/01/05 22:19:23 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.5 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,6 +16,7 @@
 
 #include "access/twophase_rmgr.h"
 #include "commands/async.h"
+#include "pgstat.h"
 #include "storage/lock.h"
 #include "utils/flatfiles.h"
 #include "utils/inval.h"
@@ -27,7 +28,8 @@ const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
        lock_twophase_recover,          /* Lock */
        NULL,                                           /* Inval */
        NULL,                                           /* flat file update */
-       NULL                                            /* notify/listen */
+       NULL,                                           /* notify/listen */
+       NULL                                            /* pgstat */
 };
 
 const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
@@ -36,7 +38,8 @@ const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
        lock_twophase_postcommit,       /* Lock */
        inval_twophase_postcommit,      /* Inval */
        flatfile_twophase_postcommit,           /* flat file update */
-       notify_twophase_postcommit      /* notify/listen */
+       notify_twophase_postcommit,     /* notify/listen */
+       pgstat_twophase_postcommit      /* pgstat */
 };
 
 const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
@@ -45,5 +48,6 @@ const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
        lock_twophase_postabort,        /* Lock */
        NULL,                                           /* Inval */
        NULL,                                           /* flat file update */
-       NULL                                            /* notify/listen */
+       NULL,                                           /* notify/listen */
+       pgstat_twophase_postabort       /* pgstat */
 };
index f2685ee0b346e3c2bbed5e1f1c4bac02c780557a..c16b4fa6be9261dfb2d12f6db29a98fd91ae7d4a 100644 (file)
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.242 2007/04/30 21:01:52 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.243 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1661,8 +1661,7 @@ CommitTransaction(void)
        AtEOXact_Files();
        AtEOXact_ComboCid();
        AtEOXact_HashTables(true);
-       pgstat_clear_snapshot();
-       pgstat_count_xact_commit();
+       AtEOXact_PgStat(true);
        pgstat_report_txn_timestamp(0);
 
        CurrentResourceOwner = NULL;
@@ -1796,6 +1795,7 @@ PrepareTransaction(void)
        AtPrepare_UpdateFlatFiles();
        AtPrepare_Inval();
        AtPrepare_Locks();
+       AtPrepare_PgStat();
 
        /*
         * Here is where we really truly prepare.
@@ -1853,6 +1853,8 @@ PrepareTransaction(void)
 
        /* notify and flatfiles don't need a postprepare call */
 
+       PostPrepare_PgStat();
+
        PostPrepare_Inval();
 
        PostPrepare_smgr();
@@ -1880,7 +1882,7 @@ PrepareTransaction(void)
        AtEOXact_Files();
        AtEOXact_ComboCid();
        AtEOXact_HashTables(true);
-       pgstat_clear_snapshot();
+       /* don't call AtEOXact_PgStat here */
 
        CurrentResourceOwner = NULL;
        ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -2035,8 +2037,7 @@ AbortTransaction(void)
        AtEOXact_Files();
        AtEOXact_ComboCid();
        AtEOXact_HashTables(false);
-       pgstat_clear_snapshot();
-       pgstat_count_xact_rollback();
+       AtEOXact_PgStat(false);
        pgstat_report_txn_timestamp(0);
 
        /*
@@ -3749,6 +3750,7 @@ CommitSubTransaction(void)
        AtEOSubXact_Files(true, s->subTransactionId,
                                          s->parent->subTransactionId);
        AtEOSubXact_HashTables(true, s->nestingLevel);
+       AtEOSubXact_PgStat(true, s->nestingLevel);
 
        /*
         * We need to restore the upper transaction's read-only state, in case the
@@ -3861,6 +3863,7 @@ AbortSubTransaction(void)
                AtEOSubXact_Files(false, s->subTransactionId,
                                                  s->parent->subTransactionId);
                AtEOSubXact_HashTables(false, s->nestingLevel);
+               AtEOSubXact_PgStat(false, s->nestingLevel);
        }
 
        /*
index 3e9a91de2f5182814742932955d6b313c192bf2d..07729da2be68725e115bef6936e88cc0af79b452 100644 (file)
@@ -21,7 +21,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.16 2007/01/05 22:19:28 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.17 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -189,7 +189,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
                scan->rs_ctup.t_len = ItemIdGetLength(lp);
                ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
 
-               pgstat_count_heap_fetch(&scan->rs_pgstat_info);
+               pgstat_count_heap_fetch(scan->rs_rd);
 
                /*
                 * Set up the result slot to point to this tuple. Note that the slot
@@ -389,7 +389,7 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)
        heap_rescan(node->ss.ss_currentScanDesc, NULL);
 
        /* undo bogus "seq scan" count (see notes in ExecInitBitmapHeapScan) */
-       pgstat_discount_heap_scan(&node->ss.ss_currentScanDesc->rs_pgstat_info);
+       pgstat_discount_heap_scan(node->ss.ss_currentScanDesc->rs_rd);
 
        if (node->tbm)
                tbm_free(node->tbm);
@@ -535,7 +535,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
         * when we actually aren't doing any such thing.  Reverse out the added
         * scan count.  (Eventually we may want to count bitmap scans separately.)
         */
-       pgstat_discount_heap_scan(&scanstate->ss.ss_currentScanDesc->rs_pgstat_info);
+       pgstat_discount_heap_scan(scanstate->ss.ss_currentScanDesc->rs_rd);
 
        /*
         * get the scan type from the relation descriptor.
index 273588424ebab575ca45c146655bbe4ea2f003b1..10f57f00b8fa6fcfc5e20d6d833334f3bf701ff4 100644 (file)
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.37 2007/03/30 18:34:55 mha Exp $
+ *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.38 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -125,13 +125,6 @@ typedef struct
 
 static BgWriterShmemStruct *BgWriterShmem;
 
-/*
- * BgWriter statistics counters.
- * Stored directly in a stats message structure so it can be sent
- * without needing to copy things around.
- */
-PgStat_MsgBgWriter BgWriterStats;
-
 /*
  * GUC parameters
  */
@@ -250,11 +243,6 @@ BackgroundWriterMain(void)
                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
        MemoryContextSwitchTo(bgwriter_context);
 
-       /*
-        * Initialize statistics counters to zero
-        */
-       memset(&BgWriterStats, 0, sizeof(BgWriterStats));
-
        /*
         * If an exception is encountered, processing resumes here.
         *
index 1fac5af284b4a8399aa0e6dc1cfd7e6e57592996..b41a16de44ce86435068597a40e0fa3537ccd08b 100644 (file)
@@ -13,7 +13,7 @@
  *
  *     Copyright (c) 2001-2007, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.155 2007/04/30 16:37:08 tgl Exp $
+ *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
  * ----------
  */
 #include "postgres.h"
@@ -39,6 +39,7 @@
 
 #include "access/heapam.h"
 #include "access/transam.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "libpq/ip.h"
@@ -98,6 +99,13 @@ bool         pgstat_collect_tuplelevel = false;
 bool           pgstat_collect_blocklevel = false;
 bool           pgstat_collect_querystring = false;
 
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
+
 /* ----------
  * Local data
  * ----------
@@ -111,43 +119,63 @@ static time_t last_pgstat_start_time;
 static bool pgStatRunningInCollector = false;
 
 /*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
  *
- * NOTE: once allocated, a PgStat_MsgTabstat struct belonging to a
- * TabStatArray is never moved or deleted for the life of the backend.
- * Also, we zero out the t_id fields of the contained PgStat_TableEntry
- * structs whenever they are not actively in use.  This allows PgStat_Info
- * pointers to be treated as long-lived data, avoiding repeated searches in
- * pgstat_initstats() when a relation is repeatedly heap_open'd or
- * index_open'd during a transaction.
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
  */
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM                100                     /* we alloc this many at a time */
+
+typedef struct TabStatusArray
 {
-       int                     tsa_alloc;              /* num allocated */
-       int                     tsa_used;               /* num actually used */
-       PgStat_MsgTabstat **tsa_messages;       /* the array itself */
-} TabStatArray;
+       struct TabStatusArray *tsa_next;        /* link to next array, if any */
+       int                     tsa_used;                               /* # entries currently used */
+       PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
+} TabStatusArray;
 
-#define TABSTAT_QUANTUM                4       /* we alloc this many at a time */
+static TabStatusArray *pgStatTabList = NULL;
+
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+       int                     nest_level;                             /* subtransaction nest level */
+       struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
+       PgStat_TableXactStatus *first;          /* head of list for this subxact */
+} PgStat_SubXactStatus;
 
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
 
 static int     pgStatXactCommit = 0;
 static int     pgStatXactRollback = 0;
 
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+       PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+       PgStat_Counter tuples_deleted;  /* tuples deleted in xact */
+       Oid                     t_id;                           /* table's OID */
+       bool            t_shared;                       /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
 static MemoryContext pgStatLocalContext = NULL;
 static HTAB *pgStatDBHash = NULL;
 static PgBackendStatus *localBackendStatusTable = NULL;
 static int     localNumBackends = 0;
 
-/*
- * BgWriter global statistics counters, from bgwriter.c
- */
-extern PgStat_MsgBgWriter BgWriterStats;
-
 /*
  * Cluster wide statistics, kept in the stats collector.
  * Contains statistics that are not collected per database
@@ -177,9 +205,12 @@ static void pgstat_write_statsfile(void);
 static HTAB *pgstat_read_statsfile(Oid onlydb);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
-static void pgstat_report_one_tabstat(TabStatArray *tsarr, Oid dbid);
+
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
 static HTAB *pgstat_collect_oids(Oid catalogid);
 
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
 static void pgstat_setup_memcxt(void);
 
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
@@ -617,12 +648,19 @@ void allow_immediate_pgstat_restart(void)
 void
 pgstat_report_tabstat(bool force)
 {
+       /* we assume this inits to all zeroes: */
+       static const PgStat_TableCounts all_zeroes;
        static TimestampTz last_report = 0;     
+
        TimestampTz now;
+       PgStat_MsgTabstat regular_msg;
+       PgStat_MsgTabstat shared_msg;
+       TabStatusArray *tsa;
+       int                     i;
 
        /* Don't expend a clock check if nothing to do */
-       if (RegularTabStat.tsa_used == 0 &&
-               SharedTabStat.tsa_used == 0)
+       if (pgStatTabList == NULL ||
+               pgStatTabList->tsa_used == 0)
                return;
 
        /*
@@ -636,51 +674,101 @@ pgstat_report_tabstat(bool force)
        last_report = now;
 
        /*
-        * For each message buffer used during the last queries, set the header
-        * fields and send it out; then mark the entries unused.
+        * Scan through the TabStatusArray struct(s) to find tables that actually
+        * have counts, and build messages to send.  We have to separate shared
+        * relations from regular ones because the databaseid field in the
+        * message header has to depend on that.
         */
-       pgstat_report_one_tabstat(&RegularTabStat, MyDatabaseId);
-       pgstat_report_one_tabstat(&SharedTabStat, InvalidOid);
+       regular_msg.m_databaseid = MyDatabaseId;
+       shared_msg.m_databaseid = InvalidOid;
+       regular_msg.m_nentries = 0;
+       shared_msg.m_nentries = 0;
+
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+       {
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+                       PgStat_MsgTabstat *this_msg;
+                       PgStat_TableEntry *this_ent;
+
+                       /* Shouldn't have any pending transaction-dependent counts */
+                       Assert(entry->trans == NULL);
+
+                       /*
+                        * Ignore entries that didn't accumulate any actual counts,
+                        * such as indexes that were opened by the planner but not used.
+                        */
+                       if (memcmp(&entry->t_counts, &all_zeroes,
+                                          sizeof(PgStat_TableCounts)) == 0)
+                               continue;
+                       /*
+                        * OK, insert data into the appropriate message, and send if full.
+                        */
+                       this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+                       this_ent = &this_msg->m_entry[this_msg->m_nentries];
+                       this_ent->t_id = entry->t_id;
+                       memcpy(&this_ent->t_counts, &entry->t_counts,
+                                  sizeof(PgStat_TableCounts));
+                       if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+                       {
+                               pgstat_send_tabstat(this_msg);
+                               this_msg->m_nentries = 0;
+                       }
+               }
+               /* zero out TableStatus structs after use */
+               MemSet(tsa->tsa_entries, 0,
+                          tsa->tsa_used * sizeof(PgStat_TableStatus));
+               tsa->tsa_used = 0;
+       }
+
+       /*
+        * Send partial messages.  If force is true, make sure that any pending
+        * xact commit/abort gets counted, even if no table stats to send.
+        */
+       if (regular_msg.m_nentries > 0 ||
+               (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+               pgstat_send_tabstat(&regular_msg);
+       if (shared_msg.m_nentries > 0)
+               pgstat_send_tabstat(&shared_msg);
 }
 
+/*
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
+ */
 static void
-pgstat_report_one_tabstat(TabStatArray *tsarr, Oid dbid)
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
 {
-       int                     i;
-
-       for (i = 0; i < tsarr->tsa_used; i++)
-       {
-               PgStat_MsgTabstat *tsmsg = tsarr->tsa_messages[i];
-               int                     n;
-               int                     len;
+       int                     n;
+       int                     len;
 
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
+       /* It's unlikely we'd get here with no socket, but maybe not impossible */
+       if (pgStatSock < 0)
+               return;
 
+       /*
+        * Report accumulated xact commit/rollback whenever we send a normal
+        * tabstat message
+        */
+       if (OidIsValid(tsmsg->m_databaseid))
+       {
                tsmsg->m_xact_commit = pgStatXactCommit;
                tsmsg->m_xact_rollback = pgStatXactRollback;
                pgStatXactCommit = 0;
                pgStatXactRollback = 0;
+       }
+       else
+       {
+               tsmsg->m_xact_commit = 0;
+               tsmsg->m_xact_rollback = 0;
+       }
 
-               /*
-                * It's unlikely we'd get here with no socket, but maybe not
-                * impossible
-                */
-               if (pgStatSock >= 0)
-               {
-                       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-                       tsmsg->m_databaseid = dbid;
-                       pgstat_send(tsmsg, len);
-               }
+       n = tsmsg->m_nentries;
+       len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+               n * sizeof(PgStat_TableEntry);
 
-               /*
-                * Zero out the entries, to mark them unused and prepare them
-                * for next use.
-                */
-               MemSet(tsmsg, 0, len);
-       }
-       tsarr->tsa_used = 0;
+       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+       pgstat_send(tsmsg, len);
 }
 
 
@@ -1016,209 +1104,489 @@ pgstat_ping(void)
        pgstat_send(&msg, sizeof(msg));
 }
 
-/*
- * Enlarge a TabStatArray
- */
-static void
-more_tabstat_space(TabStatArray *tsarr)
-{
-       PgStat_MsgTabstat *newMessages;
-       PgStat_MsgTabstat **msgArray;
-       int                     newAlloc;
-       int                     i;
-
-       AssertArg(PointerIsValid(tsarr));
-
-       newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
-
-       /* Create (another) quantum of message buffers, and zero them */
-       newMessages = (PgStat_MsgTabstat *)
-               MemoryContextAllocZero(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
-
-       /* Create or enlarge the pointer array */
-       if (tsarr->tsa_messages == NULL)
-               msgArray = (PgStat_MsgTabstat **)
-                       MemoryContextAlloc(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat *) * newAlloc);
-       else
-               msgArray = (PgStat_MsgTabstat **)
-                       repalloc(tsarr->tsa_messages,
-                                        sizeof(PgStat_MsgTabstat *) * newAlloc);
-
-       for (i = 0; i < TABSTAT_QUANTUM; i++)
-               msgArray[tsarr->tsa_alloc + i] = newMessages++;
-       tsarr->tsa_messages = msgArray;
-       tsarr->tsa_alloc = newAlloc;
-
-       Assert(tsarr->tsa_used < tsarr->tsa_alloc);
-}
 
 /* ----------
  * pgstat_initstats() -
  *
- *     Called from various places usually dealing with initialization
- *     of Relation or Scan structures. The data placed into these
- *     structures from here tell where later to count for buffer reads,
- *     scans and tuples fetched.
- *
- *     NOTE: PgStat_Info pointers in scan structures are really redundant
- *     with those in relcache entries.  The passed stats pointer might point
- *     either to the Relation struct's own pgstat_info field, or to one in
- *     a scan structure; we'll set the Relation pg_statinfo and copy it to
- *     the scan struct.
+ *     Initialize a relcache entry to count access statistics.
+ *     Called whenever a relation is opened.
  *
  *     We assume that a relcache entry's pgstat_info field is zeroed by
  *     relcache.c when the relcache entry is made; thereafter it is long-lived
- *     data.  We can avoid repeated searches of the TabStat arrays when the
+ *     data.  We can avoid repeated searches of the TabStatus arrays when the
  *     same relation is touched repeatedly within a transaction.
  * ----------
  */
 void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
 {
        Oid                     rel_id = rel->rd_id;
-       PgStat_TableEntry *useent;
-       TabStatArray *tsarr;
-       PgStat_MsgTabstat *tsmsg;
-       int                     mb;
-       int                     i;
+       char            relkind = rel->rd_rel->relkind;
+
+       /* We only count stats for things that have storage */
+       if (!(relkind == RELKIND_RELATION ||
+                 relkind == RELKIND_INDEX ||
+                 relkind == RELKIND_TOASTVALUE))
+       {
+               rel->pgstat_info = NULL;
+               return;
+       }
 
        if (pgStatSock < 0 ||
                !(pgstat_collect_tuplelevel ||
                  pgstat_collect_blocklevel))
        {
-               /* We're not counting at all. */
-               stats->tabentry = NULL;
+               /* We're not counting at all */
+               rel->pgstat_info = NULL;
                return;
        }
 
        /*
         * If we already set up this relation in the current transaction,
-        * just copy the pointer.
+        * nothing to do.
         */
-       if (rel->pgstat_info.tabentry != NULL &&
-               ((PgStat_TableEntry *) rel->pgstat_info.tabentry)->t_id == rel_id)
-       {
-               stats->tabentry = rel->pgstat_info.tabentry;
+       if (rel->pgstat_info != NULL &&
+               rel->pgstat_info->t_id == rel_id)
                return;
-       }
+
+       /* Else find or make the PgStat_TableStatus entry, and update link */
+       rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       TabStatusArray *prev_tsa;
+       int                     i;
 
        /*
-        * Search the already-used message slots for this relation.
+        * Search the already-used tabstat slots for this relation.
         */
-       tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
-
-       for (mb = 0; mb < tsarr->tsa_used; mb++)
+       prev_tsa = NULL;
+       for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
        {
-               tsmsg = tsarr->tsa_messages[mb];
-
-               for (i = tsmsg->m_nentries; --i >= 0;)
+               for (i = 0; i < tsa->tsa_used; i++)
                {
-                       if (tsmsg->m_entry[i].t_id == rel_id)
-                       {
-                               rel->pgstat_info.tabentry = (void *) &(tsmsg->m_entry[i]);
-                               stats->tabentry = rel->pgstat_info.tabentry;
-                               return;
-                       }
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
                }
 
-               if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-                       continue;
-
-               /*
-                * Not found, but found a message buffer with an empty slot instead.
-                * Fine, let's use this one.  We assume the entry was already zeroed,
-                * either at creation or after last use.
-                */
-               i = tsmsg->m_nentries++;
-               useent = &tsmsg->m_entry[i];
-               useent->t_id = rel_id;
-               rel->pgstat_info.tabentry = (void *) useent;
-               stats->tabentry = rel->pgstat_info.tabentry;
-               return;
+               if (tsa->tsa_used < TABSTAT_QUANTUM)
+               {
+                       /*
+                        * It must not be present, but we found a free slot instead.
+                        * Fine, let's use this one.  We assume the entry was already
+                        * zeroed, either at creation or after last use.
+                        */
+                       entry = &tsa->tsa_entries[tsa->tsa_used++];
+                       entry->t_id = rel_id;
+                       entry->t_shared = isshared;
+                       return entry;
+               }
        }
 
        /*
-        * If we ran out of message buffers, we just allocate more.
+        * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
         */
-       if (tsarr->tsa_used >= tsarr->tsa_alloc)
-               more_tabstat_space(tsarr);
+       tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+                                                                                                       sizeof(TabStatusArray));
+       if (prev_tsa)
+               prev_tsa->tsa_next = tsa;
+       else
+               pgStatTabList = tsa;
+
+       /*
+        * Use the first entry of the new TabStatusArray.
+        */
+       entry = &tsa->tsa_entries[tsa->tsa_used++];
+       entry->t_id = rel_id;
+       entry->t_shared = isshared;
+       return entry;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state == NULL || xact_state->nest_level != nest_level)
+       {
+               xact_state = (PgStat_SubXactStatus *)
+                       MemoryContextAlloc(TopTransactionContext,
+                                                          sizeof(PgStat_SubXactStatus));
+               xact_state->nest_level = nest_level;
+               xact_state->prev = pgStatXactStack;
+               xact_state->first = NULL;
+               pgStatXactStack = xact_state;
+       }
+       return xact_state;
+}
+
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+       PgStat_TableXactStatus *trans;
 
        /*
-        * Use the first entry of the next message buffer.
+        * If this is the first rel to be modified at the current nest level,
+        * we first have to push a transaction stack entry.
         */
-       mb = tsarr->tsa_used++;
-       tsmsg = tsarr->tsa_messages[mb];
-       tsmsg->m_nentries = 1;
-       useent = &tsmsg->m_entry[0];
-       useent->t_id = rel_id;
-       rel->pgstat_info.tabentry = (void *) useent;
-       stats->tabentry = rel->pgstat_info.tabentry;
+       xact_state = get_tabstat_stack_level(nest_level);
+
+       /* Now make a per-table stack entry */
+       trans = (PgStat_TableXactStatus *)
+               MemoryContextAllocZero(TopTransactionContext,
+                                                          sizeof(PgStat_TableXactStatus));
+       trans->nest_level = nest_level;
+       trans->upper = pgstat_info->trans;
+       trans->parent = pgstat_info;
+       trans->next = xact_state->first;
+       xact_state->first = trans;
+       pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_inserted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_inserted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_inserted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update
+ */
+void
+pgstat_count_heap_update(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_updated is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_updated++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               /* An UPDATE both inserts a new tuple and deletes the old */
+               pgstat_info->trans->tuples_inserted++;
+               pgstat_info->trans->tuples_deleted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_deleted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_deleted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_deleted++;
+       }
 }
 
 
 /* ----------
- * pgstat_count_xact_commit() -
+ * AtEOXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction commits.
+ *     Called from access/transam/xact.c at top-level transaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_commit(void)
+AtEOXact_PgStat(bool isCommit)
 {
-       if (!pgstat_collect_tuplelevel &&
-               !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactCommit++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Count transaction commit or abort.  (We use counters, not just bools,
+        * in case the reporting message isn't sent right away.)
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       if (isCommit)
+               pgStatXactCommit++;
+       else
+               pgStatXactRollback++;
 
-       if (RegularTabStat.tsa_used == 0)
+       /*
+        * Transfer transactional insert/delete counts into the base tabstat
+        * entries.  We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+                       }
+                       else
+                       {
+                               /* inserted tuples are dead, deleted tuples are unaffected */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                       }
+                       tabstat->trans = NULL;
+               }
        }
-}
+       pgStatXactStack = NULL;
 
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
 
 /* ----------
- * pgstat_count_xact_rollback() -
+ * AtEOSubXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction rollbacks.
+ *     Called from access/transam/xact.c at subtransaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_rollback(void)
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
 {
-       if (!pgstat_collect_tuplelevel &&
-               !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactRollback++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Transfer transactional insert/delete counts into the next higher
+        * subtransaction state.
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL &&
+               xact_state->nest_level >= nestDepth)
+       {
+               PgStat_TableXactStatus *trans;
+               PgStat_TableXactStatus *next_trans;
+
+               /* delink xact_state from stack immediately to simplify reuse case */
+               pgStatXactStack = xact_state->prev;
+
+               for (trans = xact_state->first; trans != NULL; trans = next_trans)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       next_trans = trans->next;
+                       Assert(trans->nest_level == nestDepth);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+                               {
+                                       trans->upper->tuples_inserted += trans->tuples_inserted;
+                                       trans->upper->tuples_deleted += trans->tuples_deleted;
+                                       tabstat->trans = trans->upper;
+                                       pfree(trans);
+                               }
+                               else
+                               {
+                                       /*
+                                        * When there isn't an immediate parent state, we can
+                                        * just reuse the record instead of going through a
+                                        * palloc/pfree pushup (this works since it's all in
+                                        * TopTransactionContext anyway).  We have to re-link
+                                        * it into the parent level, though, and that might mean
+                                        * pushing a new entry into the pgStatXactStack.
+                                        */
+                                       PgStat_SubXactStatus *upper_xact_state;
+
+                                       upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+                                       trans->next = upper_xact_state->first;
+                                       upper_xact_state->first = trans;
+                                       trans->nest_level = nestDepth - 1;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * On abort, inserted tuples are dead (and can be bounced out
+                                * to the top-level tabstat), deleted tuples are unaffected
+                                */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                               tabstat->trans = trans->upper;
+                               pfree(trans);
+                       }
+               }
+               pfree(xact_state);
+       }
+}
+
+
+/*
+ * AtPrepare_PgStat
+ *             Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
 
-       if (RegularTabStat.tsa_used == 0)
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+                       TwoPhasePgStatRecord record;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+
+                       record.tuples_inserted = trans->tuples_inserted;
+                       record.tuples_deleted = trans->tuples_deleted;
+                       record.t_id = tabstat->t_id;
+                       record.t_shared = tabstat->t_shared;
+
+                       RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+                                                                  &record, sizeof(TwoPhasePgStatRecord));
+               }
        }
 }
 
+/*
+ * PostPrepare_PgStat
+ *             Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       tabstat = trans->parent;
+                       tabstat->trans = NULL;
+               }
+       }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                  void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                 void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* inserted tuples are dead, deleted tuples are no-ops */
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
+}
+
 
 /* ----------
  * pgstat_fetch_stat_dbentry() -
@@ -1725,18 +2093,15 @@ pgstat_send(void *msg, int len)
 void
 pgstat_send_bgwriter(void)
 {
+       /* We assume this initializes to zeroes */
+       static const PgStat_MsgBgWriter all_zeroes;
+
        /*
         * This function can be called even if nothing at all has happened.
         * In this case, avoid sending a completely empty message to
         * the stats collector.
         */
-       if (BgWriterStats.m_timed_checkpoints == 0 &&
-               BgWriterStats.m_requested_checkpoints == 0 &&
-               BgWriterStats.m_buf_written_checkpoints == 0 &&
-               BgWriterStats.m_buf_written_lru == 0 &&
-               BgWriterStats.m_buf_written_all == 0 &&
-               BgWriterStats.m_maxwritten_lru == 0 &&
-               BgWriterStats.m_maxwritten_all == 0)
+       if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
                return;
 
        /*
@@ -1746,10 +2111,9 @@ pgstat_send_bgwriter(void)
        pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
 
        /*
-        * Clear out the bgwriter statistics buffer, so it can be
-        * re-used.
+        * Clear out the statistics buffer, so it can be re-used.
         */
-       memset(&BgWriterStats, 0, sizeof(BgWriterStats));
+       MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
 
@@ -2509,60 +2873,50 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                         * If it's a new table entry, initialize counters to the values we
                         * just got.
                         */
-                       tabentry->numscans = tabmsg[i].t_numscans;
-                       tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
+                       tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
+
                        tabentry->last_anl_tuples = 0;
                        tabentry->vacuum_timestamp = 0;
                        tabentry->autovac_vacuum_timestamp = 0;
                        tabentry->analyze_timestamp = 0;
                        tabentry->autovac_analyze_timestamp = 0;
-
-                       tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
                }
                else
                {
                        /*
                         * Otherwise add the values to the existing entry.
                         */
-                       tabentry->numscans += tabmsg[i].t_numscans;
-                       tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted -
-                               tabmsg[i].t_tuples_deleted;
-                       tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
-
-                       tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+                       tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
                }
 
                /*
-                * Add table stats to the database entry.
-                */
-               dbentry->n_tuples_returned += tabmsg[i].t_tuples_returned;
-               dbentry->n_tuples_fetched += tabmsg[i].t_tuples_fetched;
-               dbentry->n_tuples_inserted += tabmsg[i].t_tuples_inserted;
-               dbentry->n_tuples_updated += tabmsg[i].t_tuples_updated;
-               dbentry->n_tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-               /*
-                * And add the block IO to the database entry.
+                * Add per-table stats to the per-database entry, too.
                 */
-               dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-               dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+               dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+               dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+               dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+               dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+               dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+               dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+               dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
        }
 }
 
index 9f4876a6050323d3cbb6aa4900921a3701ad1e88..e2cfc870e2e9eeb72c05a9ed26d5fdc0f4083b6f 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.218 2007/05/02 23:34:48 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.219 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -88,12 +88,6 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
-/*
- * Global statistics for the bgwriter. The contents of this variable
- * only makes sense in the bgwriter process.
- */
-extern PgStat_MsgBgWriter BgWriterStats;
-
 
 static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum,
                                                                bool zeroPage);
@@ -174,7 +168,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
        if (isExtend)
                blockNum = smgrnblocks(reln->rd_smgr);
 
-       pgstat_count_buffer_read(&reln->pgstat_info, reln);
+       pgstat_count_buffer_read(reln);
 
        if (isLocalBuf)
        {
@@ -204,7 +198,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
                if (!isExtend)
                {
                        /* Just need to update stats before we exit */
-                       pgstat_count_buffer_hit(&reln->pgstat_info, reln);
+                       pgstat_count_buffer_hit(reln);
 
                        if (VacuumCostActive)
                                VacuumCostBalance += VacuumCostPageHit;
index 7d554c2ada2e040cb818096d0d33df26314d5c7e..45cb103adeeeb4d2462f741204fa250fc073d5f7 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.260 2007/05/02 21:08:46 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.261 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1802,6 +1802,7 @@ RelationClearRelation(Relation relation, bool rebuild)
                int                     old_refcnt = relation->rd_refcnt;
                SubTransactionId old_createSubid = relation->rd_createSubid;
                SubTransactionId old_newRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
+               struct PgStat_TableStatus *old_pgstat_info = relation->pgstat_info;
                TupleDesc       old_att = relation->rd_att;
                RuleLock   *old_rules = relation->rd_rules;
                MemoryContext old_rulescxt = relation->rd_rulescxt;
@@ -1821,6 +1822,7 @@ RelationClearRelation(Relation relation, bool rebuild)
                relation->rd_refcnt = old_refcnt;
                relation->rd_createSubid = old_createSubid;
                relation->rd_newRelfilenodeSubid = old_newRelfilenodeSubid;
+               relation->pgstat_info = old_pgstat_info;
 
                if (equalTupleDescs(old_att, relation->rd_att))
                {
index 5ea66e74672e1d42f37a8a512bae61ca4f1e60bd..ebb2e984c24903ddea1c07db8d3a42485c6a6036 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.123 2007/04/08 01:26:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.124 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -147,10 +147,10 @@ extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
 
 extern bool heap_fetch(Relation relation, Snapshot snapshot,
                   HeapTuple tuple, Buffer *userbuf, bool keep_buf,
-                  PgStat_Info *pgstat_info);
+                  Relation stats_relation);
 extern bool heap_release_fetch(Relation relation, Snapshot snapshot,
                                   HeapTuple tuple, Buffer *userbuf, bool keep_buf,
-                                  PgStat_Info *pgstat_info);
+                                  Relation stats_relation);
 
 extern void heap_get_latest_tid(Relation relation, Snapshot snapshot,
                                        ItemPointer tid);
index 77bca6be482b8e5c4025476b55a048406cd08d93..7a1ea39352ade3e95692372b55589b73d8fde517 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.52 2007/01/20 18:43:35 neilc Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.53 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,8 +37,6 @@ typedef struct HeapScanDescData
        /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
        ItemPointerData rs_mctid;       /* marked scan position, if any */
 
-       PgStat_Info rs_pgstat_info; /* statistics collector hook */
-
        /* these fields only used in page-at-a-time mode */
        int                     rs_cindex;              /* current tuple's index in vistuples */
        int                     rs_mindex;              /* marked tuple's saved index */
@@ -78,8 +76,6 @@ typedef struct IndexScanDescData
        HeapTupleData xs_ctup;          /* current heap tuple, if any */
        Buffer          xs_cbuf;                /* current heap buffer in scan, if any */
        /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
-
-       PgStat_Info xs_pgstat_info; /* statistics collector hook */
 } IndexScanDescData;
 
 typedef IndexScanDescData *IndexScanDesc;
index 0dbcd226fbde99d06fc04594eaf373b5eddcfb34..e98ad7cb375a1efd9cc12c2ae900cb4f426c6c35 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/twophase_rmgr.h,v 1.4 2007/01/05 22:19:51 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase_rmgr.h,v 1.5 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,8 @@ typedef uint8 TwoPhaseRmgrId;
 #define TWOPHASE_RM_INVAL_ID           2
 #define TWOPHASE_RM_FLATFILES_ID       3
 #define TWOPHASE_RM_NOTIFY_ID          4
-#define TWOPHASE_RM_MAX_ID                     TWOPHASE_RM_NOTIFY_ID
+#define TWOPHASE_RM_PGSTAT_ID          5
+#define TWOPHASE_RM_MAX_ID                     TWOPHASE_RM_PGSTAT_ID
 
 extern const TwoPhaseCallback twophase_recover_callbacks[];
 extern const TwoPhaseCallback twophase_postcommit_callbacks[];
index 694ee44db19adb2812ac3315891d57bfb962aab9..476fd47dc7b0ab48fba50abc3b4ad3e965c522ae 100644 (file)
@@ -5,7 +5,7 @@
  *
  *     Copyright (c) 2001-2007, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/include/pgstat.h,v 1.58 2007/04/30 16:37:08 tgl Exp $
+ *     $PostgreSQL: pgsql/src/include/pgstat.h,v 1.59 2007/05/27 03:50:39 tgl Exp $
  * ----------
  */
 #ifndef PGSTAT_H
@@ -40,6 +40,90 @@ typedef enum StatMsgType
  */
 typedef int64 PgStat_Counter;
 
+/* ----------
+ * PgStat_TableCounts                  The actual per-table counts kept by a backend
+ *
+ * This struct should contain only actual event counters, because we memcmp
+ * it against zeroes to detect whether there are any counts to transmit.
+ * It is a component of PgStat_TableStatus (within-backend state) and
+ * PgStat_TableEntry (the transmitted message format).
+ *
+ * Note: for a table, tuples_returned is the number of tuples successfully
+ * fetched by heap_getnext, while tuples_fetched is the number of tuples
+ * successfully fetched by heap_fetch under the control of bitmap indexscans.
+ * For an index, tuples_returned is the number of index entries returned by
+ * the index AM, while tuples_fetched is the number of tuples successfully
+ * fetched by heap_fetch under the control of simple indexscans for this index.
+ *
+ * tuples_inserted/tuples_updated/tuples_deleted count attempted actions,
+ * regardless of whether the transaction committed.  new_live_tuples and
+ * new_dead_tuples are properly adjusted depending on commit or abort.
+ * ----------
+ */
+typedef struct PgStat_TableCounts
+{
+       PgStat_Counter t_numscans;      /* number of heap or index scans */
+
+       PgStat_Counter t_tuples_returned;       /* via heap_getnext or index AM */
+       PgStat_Counter t_tuples_fetched;        /* via heap_fetch, see above */
+
+       PgStat_Counter t_tuples_inserted;       /* attempted, even if xact aborts */
+       PgStat_Counter t_tuples_updated;        /* attempted, even if xact aborts */
+       PgStat_Counter t_tuples_deleted;        /* attempted, even if xact aborts */
+
+       PgStat_Counter t_new_live_tuples;       /* adjusted per commit or abort */
+       PgStat_Counter t_new_dead_tuples;       /* adjusted per commit or abort */
+
+       PgStat_Counter t_blocks_fetched;        /* see pgstat_count_buffer_read */
+       PgStat_Counter t_blocks_hit;    /* see pgstat_count_buffer_hit */
+} PgStat_TableCounts;
+
+
+/* ------------------------------------------------------------
+ * Structures kept in backend local memory while accumulating counts
+ * ------------------------------------------------------------
+ */
+
+
+/* ----------
+ * PgStat_TableStatus                  Per-table status within a backend
+ *
+ * Most of the event counters are nontransactional, ie, we count events
+ * in committed and aborted transactions alike.  For these, we just count
+ * directly in the PgStat_TableStatus.  However, new_live_tuples and
+ * new_dead_tuples must be derived from tuple insertion and deletion counts
+ * with awareness of whether the transaction or subtransaction committed or
+ * aborted.  Hence, we also keep a stack of per-(sub)transaction status
+ * records for every table modified in the current transaction.  At commit
+ * or abort, we propagate tuples_inserted and tuples_deleted up to the
+ * parent subtransaction level, or out to the parent PgStat_TableStatus,
+ * as appropriate.
+ * ----------
+ */
+typedef struct PgStat_TableStatus
+{
+       Oid                     t_id;                           /* table's OID */
+       bool            t_shared;                       /* is it a shared catalog? */
+       struct PgStat_TableXactStatus *trans;   /* lowest subxact's counts */
+       PgStat_TableCounts t_counts;    /* event counts to be sent */
+} PgStat_TableStatus;
+
+/* ----------
+ * PgStat_TableXactStatus              Per-table, per-subtransaction status
+ * ----------
+ */
+typedef struct PgStat_TableXactStatus
+{
+       PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */
+       PgStat_Counter tuples_deleted;  /* tuples deleted in (sub)xact */
+       int                     nest_level;                     /* subtransaction nest level */
+       /* links to other structs for same relation: */
+       struct PgStat_TableXactStatus *upper;   /* next higher subxact if any */
+       PgStat_TableStatus *parent;                             /* per-table status */
+       /* structs of same subxact level are linked here: */
+       struct PgStat_TableXactStatus *next;    /* next of same subxact */
+} PgStat_TableXactStatus;
+
 
 /* ------------------------------------------------------------
  * Message formats follow
@@ -78,30 +162,12 @@ typedef struct PgStat_MsgDummy
 
 /* ----------
  * PgStat_TableEntry                   Per-table info in a MsgTabstat
- *
- * Note: for a table, tuples_returned is the number of tuples successfully
- * fetched by heap_getnext, while tuples_fetched is the number of tuples
- * successfully fetched by heap_fetch under the control of bitmap indexscans.
- * For an index, tuples_returned is the number of index entries returned by
- * the index AM, while tuples_fetched is the number of tuples successfully
- * fetched by heap_fetch under the control of simple indexscans for this index.
  * ----------
  */
 typedef struct PgStat_TableEntry
 {
        Oid                     t_id;
-
-       PgStat_Counter t_numscans;
-
-       PgStat_Counter t_tuples_returned;
-       PgStat_Counter t_tuples_fetched;
-
-       PgStat_Counter t_tuples_inserted;
-       PgStat_Counter t_tuples_updated;
-       PgStat_Counter t_tuples_deleted;
-
-       PgStat_Counter t_blocks_fetched;
-       PgStat_Counter t_blocks_hit;
+       PgStat_TableCounts t_counts;
 } PgStat_TableEntry;
 
 /* ----------
@@ -393,6 +459,10 @@ extern bool pgstat_collect_tuplelevel;
 extern bool pgstat_collect_blocklevel;
 extern bool pgstat_collect_querystring;
 
+/*
+ * BgWriter statistics counters are updated directly by bgwriter and bufmgr
+ */
+extern PgStat_MsgBgWriter BgWriterStats;
 
 /* ----------
  * Functions called from postmaster
@@ -436,83 +506,67 @@ extern void pgstat_report_activity(const char *what);
 extern void pgstat_report_txn_timestamp(TimestampTz tstamp);
 extern void pgstat_report_waiting(bool waiting);
 
-extern void pgstat_initstats(PgStat_Info *stats, Relation rel);
+extern void pgstat_initstats(Relation rel);
 
+/* nontransactional event counts are simple enough to inline */
 
-#define pgstat_count_heap_scan(s)                                                                              \
+#define pgstat_count_heap_scan(rel)                                                                            \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_numscans++;           \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_numscans++;                                      \
        } while (0)
 /* kluge for bitmap scans: */
-#define pgstat_discount_heap_scan(s)                                                                   \
+#define pgstat_discount_heap_scan(rel)                                                                 \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_numscans--;           \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_numscans--;                                      \
        } while (0)
-#define pgstat_count_heap_getnext(s)                                                                   \
+#define pgstat_count_heap_getnext(rel)                                                                 \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_returned++; \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_tuples_returned++;                       \
        } while (0)
-#define pgstat_count_heap_fetch(s)                                                                             \
+#define pgstat_count_heap_fetch(rel)                                                                   \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_fetched++; \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_tuples_fetched++;                        \
        } while (0)
-#define pgstat_count_heap_insert(s)                                                                            \
+#define pgstat_count_index_scan(rel)                                                                   \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_inserted++; \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_numscans++;                                      \
        } while (0)
-#define pgstat_count_heap_update(s)                                                                            \
+#define pgstat_count_index_tuples(rel, n)                                                              \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_updated++; \
+               if (pgstat_collect_tuplelevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_tuples_returned += (n);          \
        } while (0)
-#define pgstat_count_heap_delete(s)                                                                            \
+#define pgstat_count_buffer_read(rel)                                                                  \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_deleted++; \
+               if (pgstat_collect_blocklevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_blocks_fetched++;                        \
        } while (0)
-#define pgstat_count_index_scan(s)                                                                             \
+#define pgstat_count_buffer_hit(rel)                                                                   \
        do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_numscans++;           \
-       } while (0)
-#define pgstat_count_index_tuples(s, n)                                                                        \
-       do {                                                                                                                            \
-               if (pgstat_collect_tuplelevel && (s)->tabentry != NULL)                 \
-                       ((PgStat_TableEntry *)((s)->tabentry))->t_tuples_returned += (n); \
-       } while (0)
-#define pgstat_count_buffer_read(s,r)                                                                  \
-       do {                                                                                                                            \
-               if (pgstat_collect_blocklevel) {                                                                \
-                       if ((s)->tabentry != NULL)                                                                      \
-                               ((PgStat_TableEntry *)((s)->tabentry))->t_blocks_fetched++; \
-                       else {                                                                                                          \
-                               pgstat_initstats((s), (r));                                                             \
-                               if ((s)->tabentry != NULL)                                                              \
-                                       ((PgStat_TableEntry *)((s)->tabentry))->t_blocks_fetched++; \
-                       }                                                                                                                       \
-               }                                                                                                                               \
-       } while (0)
-#define pgstat_count_buffer_hit(s,r)                                                                   \
-       do {                                                                                                                            \
-               if (pgstat_collect_blocklevel) {                                                                \
-                       if ((s)->tabentry != NULL)                                                                      \
-                               ((PgStat_TableEntry *)((s)->tabentry))->t_blocks_hit++; \
-                       else {                                                                                                          \
-                               pgstat_initstats((s), (r));                                                             \
-                               if ((s)->tabentry != NULL)                                                              \
-                                       ((PgStat_TableEntry *)((s)->tabentry))->t_blocks_hit++; \
-                       }                                                                                                                       \
-               }                                                                                                                               \
+               if (pgstat_collect_blocklevel && (rel)->pgstat_info != NULL)    \
+                       (rel)->pgstat_info->t_counts.t_blocks_hit++;                            \
        } while (0)
 
+extern void pgstat_count_heap_insert(Relation rel);
+extern void pgstat_count_heap_update(Relation rel);
+extern void pgstat_count_heap_delete(Relation rel);
+
+extern void AtEOXact_PgStat(bool isCommit);
+extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth);
+
+extern void AtPrepare_PgStat(void);
+extern void PostPrepare_PgStat(void);
+
+extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                                          void *recdata, uint32 len);
+extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                                         void *recdata, uint32 len);
 
-extern void pgstat_count_xact_commit(void);
-extern void pgstat_count_xact_rollback(void);
 extern void pgstat_send_bgwriter(void);
 
 /* ----------
index 33795de2bf8acb17a278c74e7e69045bbffeef40..bc6bf190b86c198851a4319569da3ab3ab375fec 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.100 2007/03/29 00:15:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.101 2007/05/27 03:50:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -89,15 +89,6 @@ typedef struct TriggerDesc
 } TriggerDesc;
 
 
-/*
- * Same for the statistics collector data in Relation and scan data.
- */
-typedef struct PgStat_Info
-{
-       void       *tabentry;
-} PgStat_Info;
-
-
 /*
  * Cached lookup information for the index access method functions defined
  * by the pg_am row associated with an index relation.
@@ -200,8 +191,8 @@ typedef struct RelationData
        List       *rd_indpred;         /* index predicate tree, if any */
        void       *rd_amcache;         /* available for use by index AM */
 
-       /* statistics collection area */
-       PgStat_Info pgstat_info;
+       /* use "struct" here to avoid needing to include pgstat.h: */
+       struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
 } RelationData;
 
 typedef RelationData *Relation;