From 46f95d664eda527fab734ac3e7ee9d5c23b6ae23 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 31 Aug 2025 16:37:05 -0700 Subject: [PATCH 1/7] Instrumentation: Keep time fields as instrtime, require caller to convert Previously the Instrumentation logic always converted to seconds, only for many of the callers to do unnecessary division to get to milliseconds. Since an upcoming refactoring will split the Instrumentation struct, utilize instrtime always to keep things simpler. --- contrib/auto_explain/auto_explain.c | 2 +- .../pg_stat_statements/pg_stat_statements.c | 2 +- src/backend/commands/explain.c | 12 +++++------ src/backend/executor/instrument.c | 20 ++++++++----------- src/include/executor/instrument.h | 6 +++--- src/include/portability/instr_time.h | 2 ++ 6 files changed, 21 insertions(+), 23 deletions(-) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index 1f4badb49284..c10f2fc0f255 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -388,7 +388,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc) InstrEndLoop(queryDesc->totaltime); /* Log plan if duration is exceeded. */ - msec = queryDesc->totaltime->total * 1000.0; + msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); if (msec >= auto_explain_log_min_duration) { ExplainState *es = NewExplainState(); diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 39208f80b5bb..47de4c98ae31 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -1093,7 +1093,7 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - queryDesc->totaltime->total * 1000.0, /* convert to msec */ + INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, &queryDesc->totaltime->bufusage, &queryDesc->totaltime->walusage, diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 5a6390631eba..f0a3a29d6795 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1136,7 +1136,7 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) appendStringInfo(es->str, " on %s", relname); if (es->timing) appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - 1000.0 * instr->total, instr->ntuples); + INSTR_TIME_GET_MILLISEC(instr->total), instr->ntuples); else appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); } @@ -1147,7 +1147,7 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Constraint Name", conname, es); ExplainPropertyText("Relation", relname, es); if (es->timing) - ExplainPropertyFloat("Time", "ms", 1000.0 * instr->total, 3, + ExplainPropertyFloat("Time", "ms", INSTR_TIME_GET_MILLISEC(instr->total), 3, es); ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); } @@ -1835,8 +1835,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate->instrument && planstate->instrument->nloops > 0) { double nloops = planstate->instrument->nloops; - double startup_ms = 1000.0 * planstate->instrument->startup / nloops; - double total_ms = 1000.0 * planstate->instrument->total / nloops; + double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops; + double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops; double rows = planstate->instrument->ntuples / nloops; if (es->format == EXPLAIN_FORMAT_TEXT) @@ -1901,8 +1901,8 @@ ExplainNode(PlanState *planstate, List *ancestors, if (nloops <= 0) continue; - startup_ms = 1000.0 * instrument->startup / nloops; - total_ms = 1000.0 * instrument->total / nloops; + startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops; + total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops; rows = instrument->ntuples / nloops; ExplainOpenWorker(n, es); diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 9e11c662a7c1..20653a5c4c4e 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -114,7 +114,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (!instr->running) { instr->running = true; - instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + instr->firsttuple = instr->counter; } else { @@ -123,7 +123,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) * this might be the first tuple */ if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + instr->firsttuple = instr->counter; } } @@ -139,8 +139,6 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) void InstrEndLoop(Instrumentation *instr) { - double totaltime; - /* Skip if nothing has happened, or already shut down */ if (!instr->running) return; @@ -149,10 +147,8 @@ InstrEndLoop(Instrumentation *instr) elog(ERROR, "InstrEndLoop called on running node"); /* Accumulate per-cycle statistics into totals */ - totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); - - instr->startup += instr->firsttuple; - instr->total += totaltime; + INSTR_TIME_ADD(instr->startup, instr->firsttuple); + INSTR_TIME_ADD(instr->total, instr->counter); instr->ntuples += instr->tuplecount; instr->nloops += 1; @@ -160,7 +156,7 @@ InstrEndLoop(Instrumentation *instr) instr->running = false; INSTR_TIME_SET_ZERO(instr->starttime); INSTR_TIME_SET_ZERO(instr->counter); - instr->firsttuple = 0; + INSTR_TIME_SET_ZERO(instr->firsttuple); instr->tuplecount = 0; } @@ -173,14 +169,14 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->running = true; dst->firsttuple = add->firsttuple; } - else if (dst->running && add->running && dst->firsttuple > add->firsttuple) + else if (dst->running && add->running && INSTR_TIME_LT(dst->firsttuple, add->firsttuple)) dst->firsttuple = add->firsttuple; INSTR_TIME_ADD(dst->counter, add->counter); dst->tuplecount += add->tuplecount; - dst->startup += add->startup; - dst->total += add->total; + INSTR_TIME_ADD(dst->startup, add->startup); + INSTR_TIME_ADD(dst->total, add->total); dst->ntuples += add->ntuples; dst->ntuples2 += add->ntuples2; dst->nloops += add->nloops; diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index ffe470f2b84c..dfc8b3c97657 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -78,13 +78,13 @@ typedef struct Instrumentation bool running; /* true if we've completed first tuple */ instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ - double firsttuple; /* time for first tuple of this cycle */ + instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ BufferUsage bufusage_start; /* buffer usage at start */ WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ - double startup; /* total startup time (in seconds) */ - double total; /* total time (in seconds) */ + instr_time startup; /* total startup time */ + instr_time total; /* total time */ double ntuples; /* total tuples produced */ double ntuples2; /* secondary node-specific tuple counter */ double nloops; /* # of run cycles for this node */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index f71a851b18d8..1c1c18f780a1 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -184,6 +184,8 @@ GetTimerFrequency(void) #define INSTR_TIME_ACCUM_DIFF(x,y,z) \ ((x).ticks += (y).ticks - (z).ticks) +#define INSTR_TIME_LT(x,y) \ + ((x).ticks > (y).ticks) #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) From a5cb86c7f86b10962474b7741e501be434da832c Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 1 Mar 2025 19:31:30 -0800 Subject: [PATCH 2/7] Separate node and trigger instrumentation from other use of Instrumentation struct Previously different places (e.g. query "total time") were repurposing the Instrumentation struct initially introduced for capturing per-node statistics during execution. This overuse of the same struct is confusing, e.g. by cluttering calls of InstrStartNode/InstrStopNode in unrelated code paths, and prevents future refactorings. Instead, simplify the Instrumentation struct to only track time and WAL/buffer usage. Similarly, drop the use of InstrEndLoop outside of per-node instrumentation - these calls were added without any apparent benefit since the relevant fields were never read. Introduce the NodeInstrumentation struct to carry forward the per-node instrumentation information, and introduce TriggerInstrumentation to capture trigger timing and firings (previously counted in "ntuples"). --- contrib/auto_explain/auto_explain.c | 8 +- .../pg_stat_statements/pg_stat_statements.c | 8 +- contrib/postgres_fdw/postgres_fdw.c | 2 +- src/backend/commands/explain.c | 21 ++- src/backend/commands/trigger.c | 22 ++-- src/backend/executor/execMain.c | 10 +- src/backend/executor/execParallel.c | 22 ++-- src/backend/executor/execProcnode.c | 4 +- src/backend/executor/instrument.c | 120 ++++++++++++++++-- src/include/executor/instrument.h | 62 +++++++-- src/include/nodes/execnodes.h | 5 +- 11 files changed, 209 insertions(+), 75 deletions(-) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index c10f2fc0f255..ee0c3b4c91bb 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -381,12 +381,6 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. */ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); if (msec >= auto_explain_log_min_duration) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 47de4c98ae31..7f56592f5362 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -1023,7 +1023,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -1082,12 +1082,6 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 5e178c21b390..6f7a10a02674 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f0a3a29d6795..f167e1f6371a 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1099,18 +1099,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1135,10 +1132,10 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1147,9 +1144,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Constraint Name", conname, es); ExplainPropertyText("Relation", relname, es); if (es->timing) - ExplainPropertyFloat("Time", "ms", INSTR_TIME_GET_MILLISEC(instr->total), 3, + ExplainPropertyFloat("Time", "ms", INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) @@ -1893,7 +1890,7 @@ ExplainNode(PlanState *planstate, List *ancestors, for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -2300,7 +2297,7 @@ ExplainNode(PlanState *planstate, List *ancestors, for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 12c97f2c023b..a26a017b78df 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation * instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2309,7 +2309,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation * instr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -3947,7 +3947,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation * instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4340,7 +4340,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation * instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4381,7 +4381,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Fetch the required tuple(s). @@ -4604,10 +4604,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4723,7 +4723,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 797d8b1ca1cb..6a2ca742f97e 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -331,7 +331,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* * extract information from the query descriptor and the query feature. @@ -383,7 +383,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -433,7 +433,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -443,7 +443,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) AfterTriggerEndQuery(estate); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1266,7 +1266,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); } else { diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 26200c5a3d6e..8d04b8c6ece7 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -86,7 +86,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. * * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -103,11 +103,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -724,7 +728,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -810,7 +814,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); @@ -820,7 +824,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instrumentation); pei->instrumentation = instrumentation; @@ -1052,7 +1056,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1080,7 +1084,7 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, * Switch into per-query memory context. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); @@ -1312,7 +1316,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index f5f9cfbeeada..d286471254b1 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -413,8 +413,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + result->instrument = InstrAllocNode(1, estate->es_instrument, + result->async_capable); return result; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 20653a5c4c4e..41a342cab7f4 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -26,9 +26,9 @@ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); -/* Allocate new instrumentation structure(s) */ +/* General purpose instrumentation handling */ Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +InstrAlloc(int n, int instrument_options) { Instrumentation *instr; @@ -41,6 +41,108 @@ InstrAlloc(int n, int instrument_options, bool async_mode) bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; int i; + for (i = 0; i < n; i++) + { + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; + } + } + + return instr; +} + +void +InstrStart(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStart called twice in a row"); + + if (instr->need_bufusage) + instr->bufusage_start = pgBufferUsage; + + if (instr->need_walusage) + instr->walusage_start = pgWalUsage; +} + +void +InstrStop(Instrumentation *instr) +{ + instr_time endtime; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStop called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) + BufferUsageAccumDiff(&instr->bufusage, + &pgBufferUsage, &instr->bufusage_start); + + if (instr->need_walusage) + WalUsageAccumDiff(&instr->walusage, + &pgWalUsage, &instr->walusage_start); +} + +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(int n, int instrument_options) +{ + TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + + for (i = 0; i < n; i++) + { + tginstr[i].instr.need_bufusage = need_buffers; + tginstr[i].instr.need_walusage = need_wal; + tginstr[i].instr.need_timer = need_timer; + } + + return tginstr; +} + +void +InstrStartTrigger(TriggerInstrumentation * tginstr) +{ + InstrStart(&tginstr->instr); +} + +void +InstrStopTrigger(TriggerInstrumentation * tginstr, int firings) +{ + InstrStop(&tginstr->instr); + tginstr->firings += firings; +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure(s) */ +NodeInstrumentation * +InstrAllocNode(int n, int instrument_options, bool async_mode) +{ + NodeInstrumentation *instr; + + /* initialize all fields to zeroes, then modify as needed */ + instr = palloc0(n * sizeof(NodeInstrumentation)); + if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + { + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + for (i = 0; i < n; i++) { instr[i].need_bufusage = need_buffers; @@ -55,9 +157,9 @@ InstrAlloc(int n, int instrument_options, bool async_mode) /* Initialize a pre-allocated instrumentation structure. */ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitNode(NodeInstrumentation * instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); + memset(instr, 0, sizeof(NodeInstrumentation)); instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; @@ -65,7 +167,7 @@ InstrInit(Instrumentation *instr, int instrument_options) /* Entry to a plan node */ void -InstrStartNode(Instrumentation *instr) +InstrStartNode(NodeInstrumentation * instr) { if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) @@ -81,7 +183,7 @@ InstrStartNode(Instrumentation *instr) /* Exit from a plan node */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStopNode(NodeInstrumentation * instr, double nTuples) { double save_tuplecount = instr->tuplecount; instr_time endtime; @@ -129,7 +231,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -137,7 +239,7 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation * instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) @@ -162,7 +264,7 @@ InstrEndLoop(Instrumentation *instr) /* aggregate instrumentation information */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) { if (!dst->running && add->running) { diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index dfc8b3c97657..4986f6cea54c 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -67,7 +67,40 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* + * General purpose instrumentation that can capture time and WAL/buffer usage + * + * Initialized through InstrAlloc, followed by one or more calls to a pair of + * InstrStart/InstrStop (activity is measured inbetween). + */ typedef struct Instrumentation +{ + /* Parameters set at creation: */ + bool need_timer; /* true if we need timer data */ + bool need_bufusage; /* true if we need buffer usage data */ + bool need_walusage; /* true if we need WAL usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + BufferUsage bufusage_start; /* buffer usage at start */ + WalUsage walusage_start; /* WAL usage at start */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + BufferUsage bufusage; /* total buffer usage */ + WalUsage walusage; /* total WAL usage */ +} Instrumentation; + +/* Trigger instrumentation */ +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + */ +typedef struct NodeInstrumentation { /* Parameters set at node creation: */ bool need_timer; /* true if we need timer data */ @@ -92,25 +125,34 @@ typedef struct Instrumentation double nfiltered2; /* # of tuples removed by "other" quals */ BufferUsage bufusage; /* total buffer usage */ WalUsage walusage; /* total WAL usage */ -} Instrumentation; +} NodeInstrumentation; typedef struct WorkerInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerInstrumentation; extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); +extern Instrumentation *InstrAlloc(int n, int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr); + +extern TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options); +extern void InstrStartTrigger(TriggerInstrumentation * tginstr); +extern void InstrStopTrigger(TriggerInstrumentation * tginstr, int firings); + +extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options, + bool async_mode); +extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options); +extern void InstrStartNode(NodeInstrumentation * instr); +extern void InstrStopNode(NodeInstrumentation * instr, double nTuples); +extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation * instr); +extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add); + extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3968429f9919..498ba2bb8d9e 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -521,7 +521,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation *ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ @@ -1172,7 +1172,8 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ /* Per-worker JIT instrumentation */ From dc918a0868ba3e9de47acdef33f6e87adb313add Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 31 Aug 2025 16:34:42 -0700 Subject: [PATCH 3/7] Replace direct changes of pgBufferUsage/pgWalUsage with INSTR_* macros This encapsulates the ownership of these globals better, and will allow a subsequent refactoring. --- src/backend/access/transam/xlog.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 24 ++++++++++++------------ src/backend/storage/buffer/localbuf.c | 6 +++--- src/backend/storage/file/buffile.c | 8 ++++---- src/backend/utils/activity/pgstat_io.c | 8 ++++---- src/include/executor/instrument.h | 19 +++++++++++++++++++ 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1b7ef589fc09..b4d25dfd2bb2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1081,10 +1081,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2064,7 +2064,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = 0; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index eb55102b0d7f..a4171aa270bb 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -800,7 +800,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -821,7 +821,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1222,14 +1222,14 @@ PinBufferForBlock(Relation rel, { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); } else { bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); } if (rel) { @@ -1963,9 +1963,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); if (operation->rel) pgstat_count_buffer_hit(operation->rel); @@ -2033,9 +2033,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. @@ -2919,7 +2919,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3079,7 +3079,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4489,7 +4489,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and @@ -5648,7 +5648,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (dirtied) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 15aac7d1c9fe..4481920ea5ff 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u32(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 85b316d879dc..55d0d170287b 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -474,13 +474,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -548,13 +548,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 13ae57ed6498..4f6274eb573f 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 4986f6cea54c..8e435e1f92c8 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -161,4 +161,23 @@ extern void BufferUsageAccumDiff(BufferUsage *dst, extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + pgBufferUsage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + pgBufferUsage.fld += val; \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + } while (0) +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += val; \ + } while(0) + #endif /* INSTRUMENT_H */ From 4fecc84b447fdf0eb81c64ae541fc736cd511a56 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH 4/7] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level Instrumentation struct, and its associated InstrStart/ InstrStop helpers (which use resource owners to handle aborts), or dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. --- .../pg_stat_statements/pg_stat_statements.c | 105 ++++----- src/backend/commands/explain.c | 8 +- src/backend/executor/execMain.c | 28 ++- src/backend/executor/execProcnode.c | 31 +++ src/backend/executor/instrument.c | 218 ++++++++++++++---- src/include/executor/executor.h | 1 + src/include/executor/instrument.h | 64 ++++- src/include/utils/resowner.h | 1 + 8 files changed, 333 insertions(+), 123 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 7f56592f5362..1ed3660cf9bc 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,21 +911,13 @@ pgss_planner(Query *parse, { instr_time start; instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + /* We need to track buffer/WAL usage as the planner can access them. */ + InstrPushStack(&stack); + nesting_level++; PG_TRY(); { @@ -938,6 +930,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrPopStack(&stack, true); nesting_level--; } PG_END_TRY(); @@ -945,14 +938,6 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -960,8 +945,8 @@ pgss_planner(Query *parse, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), 0, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1089,8 +1074,13 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + + /* + * Check if stack is initialized - it is not when ExecutorRun wasn't + * called + */ + queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL, + queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1157,14 +1147,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, instr_time start; instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + InstrPushStack(&stack); nesting_level++; PG_TRY(); @@ -1180,6 +1166,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrPopStack(&stack, true); nesting_level--; } PG_END_TRY(); @@ -1208,14 +1195,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, @@ -1223,8 +1202,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), rows, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1454,27 +1433,33 @@ pgss_store(const char *query, int64 queryId, } } entry->counters.rows += rows; - entry->counters.shared_blks_hit += bufusage->shared_blks_hit; - entry->counters.shared_blks_read += bufusage->shared_blks_read; - entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; - entry->counters.shared_blks_written += bufusage->shared_blks_written; - entry->counters.local_blks_hit += bufusage->local_blks_hit; - entry->counters.local_blks_read += bufusage->local_blks_read; - entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; - entry->counters.local_blks_written += bufusage->local_blks_written; - entry->counters.temp_blks_read += bufusage->temp_blks_read; - entry->counters.temp_blks_written += bufusage->temp_blks_written; - entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); - entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); - entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); - entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); - entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); - entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + if (bufusage) + { + entry->counters.shared_blks_hit += bufusage->shared_blks_hit; + entry->counters.shared_blks_read += bufusage->shared_blks_read; + entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; + entry->counters.shared_blks_written += bufusage->shared_blks_written; + entry->counters.local_blks_hit += bufusage->local_blks_hit; + entry->counters.local_blks_read += bufusage->local_blks_read; + entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; + entry->counters.local_blks_written += bufusage->local_blks_written; + entry->counters.temp_blks_read += bufusage->temp_blks_read; + entry->counters.temp_blks_written += bufusage->temp_blks_written; + entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); + entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); + entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); + entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); + entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); + entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + } entry->counters.usage += USAGE_EXEC(total_time); - entry->counters.wal_records += walusage->wal_records; - entry->counters.wal_fpi += walusage->wal_fpi; - entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; + if (walusage) + { + entry->counters.wal_records += walusage->wal_records; + entry->counters.wal_fpi += walusage->wal_fpi; + entry->counters.wal_bytes += walusage->wal_bytes; + entry->counters.wal_buffers_full += walusage->wal_buffers_full; + } if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f167e1f6371a..0dbace901e40 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2286,9 +2286,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2305,9 +2305,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 6a2ca742f97e..9bafe1c9f391 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -329,6 +329,13 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(1, estate->es_instrument); + /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) InstrStart(queryDesc->totaltime); @@ -383,7 +390,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, false); MemoryContextSwitchTo(oldcontext); } @@ -442,8 +449,15 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per node statistics, and then shut down instrumentation + * stack + */ + if (queryDesc->totaltime && estate->es_instrument) + ExecAccumNodeInstrumentation(queryDesc->planstate); + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, true); MemoryContextSwitchTo(oldcontext); @@ -1266,7 +1280,15 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); + { + /* + * Triggers do not individually track buffer/WAL usage, even if + * otherwise tracked + */ + int opts = (instrument_options & INSTRUMENT_TIMER) != 0 ? INSTRUMENT_TIMER : INSTRUMENT_ROWS; + + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, opts); + } } else { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d286471254b1..d00cf820a272 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,7 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecAccumNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -828,6 +829,36 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecAccumNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecAccumNodeInstrumentation(PlanState *node) +{ + (void) ExecAccumNodeInstrumentation_walker(node, NULL); +} + +static bool +ExecAccumNodeInstrumentation_walker(PlanState *node, void *context) +{ + if (node == NULL) + return false; + + planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context); + + if (node->instrument && node->instrument->stack.previous) + InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 41a342cab7f4..37055d01f61b 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,37 +16,103 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; +InstrStack *pgInstrStack = NULL; static WalUsage save_pgWalUsage; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +/* + * Use ResourceOwner mechanism to correctly reset pgInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack * stack) +{ + ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack * stack) +{ + ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static bool +StackIsParent(InstrStack * stack, InstrStack * entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + InstrStack *stack = (InstrStack *) DatumGetPointer(res); + + if (pgInstrStack) + { + /* + * Because registered resources are *not* cleaned up in a guaranteed + * order, we may get a child context after we've processed the parent. + * Thus, we only change the stack if its not already a parent of the + * stack being released. + * + * If we already walked up the stack with an earlier resource, simply + * accumulate all collected stats before the abort to the current + * stack. + * + * Note that StackIsParent will recurse as needed, so it is + * inadvisible to use deeply nested stacks. + */ + if (!StackIsParent(pgInstrStack, stack)) + InstrPopStack(stack, true); + else + InstrStackAdd(pgInstrStack, stack); + } + + /* + * Ensure long-lived memory is freed now, as we don't expect InstrStop to + * be called + */ + pfree(stack); +} /* General purpose instrumentation handling */ Instrumentation * InstrAlloc(int n, int instrument_options) { - Instrumentation *instr; + Instrumentation *instr = palloc0(n * sizeof(Instrumentation)); + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + for (i = 0; i < n; i++) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - } + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; } return instr; @@ -59,15 +125,31 @@ InstrStart(Instrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + Assert(CurrentResourceOwner != NULL); + instr->owner = CurrentResourceOwner; + + /* + * Allocate the stack resource in a memory context that survives + * during an abort. This will be freed by InstrStop (regular + * execution) or ResOwnerReleaseInstrumentation (abort). + * + * We don't do this in InstrAlloc to avoid leaking when InstrStart + + * InstrStop isn't called. + */ + if (instr->stack == NULL) + instr->stack = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack)); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + ResourceOwnerEnlarge(instr->owner); + ResourceOwnerRememberInstrStack(instr->owner, instr->stack); + + InstrPushStack(instr->stack); + } } void -InstrStop(Instrumentation *instr) +InstrStop(Instrumentation *instr, bool finalize) { instr_time endtime; @@ -83,14 +165,28 @@ InstrStop(Instrumentation *instr) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + InstrPopStack(instr->stack, finalize); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + Assert(instr->owner != NULL); + ResourceOwnerForgetInstrStack(instr->owner, instr->stack); + instr->owner = NULL; + + if (finalize) + { + /* + * To avoid keeping memory allocated beyond when its needed, copy + * the result to the current memory context, and free it in the + * transaction context. + */ + InstrStack *stack = palloc(sizeof(InstrStack)); + + memcpy(stack, instr->stack, sizeof(InstrStack)); + pfree(instr->stack); + instr->stack = stack; + } + } } /* Trigger instrumentation handling */ @@ -98,15 +194,20 @@ TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options) { TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; int i; + /* + * To avoid having to determine when the last trigger fired, we never + * track WAL/buffer usage for now + */ + Assert((instrument_options & INSTRUMENT_BUFFERS) == 0); + Assert((instrument_options & INSTRUMENT_WAL) == 0); + for (i = 0; i < n; i++) { - tginstr[i].instr.need_bufusage = need_buffers; - tginstr[i].instr.need_walusage = need_wal; + tginstr[i].instr.need_bufusage = false; + tginstr[i].instr.need_walusage = false; tginstr[i].instr.need_timer = need_timer; } @@ -122,7 +223,11 @@ InstrStartTrigger(TriggerInstrumentation * tginstr) void InstrStopTrigger(TriggerInstrumentation * tginstr, int firings) { - InstrStop(&tginstr->instr); + /* + * trigger instrumentation does not track WAL/buffer usage, so its okay to + * never finalize + */ + InstrStop(&tginstr->instr, false); tginstr->firings += firings; } @@ -173,12 +278,13 @@ InstrStartNode(NodeInstrumentation * instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(pgInstrStack != NULL); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(&instr->stack); + } } /* Exit from a plan node */ @@ -203,14 +309,14 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(instr->stack.previous != NULL); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Adding to parent is handled by ExecAccumNodeInstrumentation */ + InstrPopStack(&instr->stack, false); + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -287,13 +393,13 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } -/* note current values during parallel executor startup */ +/* start instrumentation during parallel executor startup */ void InstrStartParallelQuery(void) { @@ -315,10 +421,28 @@ InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { + if (pgInstrStack != NULL) + { + InstrStack *dst = pgInstrStack; + + BufferUsageAdd(&dst->bufusage, bufusage); + WalUsageAdd(&dst->walusage, walusage); + } + BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } +void +InstrStackAdd(InstrStack * dst, InstrStack * add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); +} + /* dst += add */ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7cd6a49309f0..8f29c6baf301 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,7 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecAccumNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 8e435e1f92c8..30d81fceaaa3 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -67,12 +67,25 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack of WAL/buffer usage used for per-node instrumentation */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; +} InstrStack; + /* * General purpose instrumentation that can capture time and WAL/buffer usage * * Initialized through InstrAlloc, followed by one or more calls to a pair of * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller *must* not exit out of + * top level transaction between InstrStart/InstrStop calls in regular execution. If this is needed, + * directly use InstrPushStack/InstrPopStack in a PG_TRY/PG_FINALLY block instead. */ +struct ResourceOwnerData; typedef struct Instrumentation { /* Parameters set at creation: */ @@ -81,12 +94,10 @@ typedef struct Instrumentation bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack *stack; /* stack tracking buffer/WAL usage */ + struct ResourceOwnerData *owner; } Instrumentation; /* Trigger instrumentation */ @@ -99,6 +110,10 @@ typedef struct TriggerInstrumentation /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for WAL/buffer + * usage statistics, and relies on it for managing aborts. Solely intended for + * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -113,8 +128,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -123,8 +136,7 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; typedef struct WorkerInstrumentation @@ -135,10 +147,31 @@ typedef struct WorkerInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +extern PGDLLIMPORT InstrStack * pgInstrStack; + +extern void InstrStackAdd(InstrStack * dst, InstrStack * add); + +static inline void +InstrPushStack(InstrStack * stack) +{ + stack->previous = pgInstrStack; + pgInstrStack = stack; +} + +static inline void +InstrPopStack(InstrStack * stack, bool add_to_parent) +{ + Assert(stack != NULL); + + pgInstrStack = stack->previous; + + if (pgInstrStack && add_to_parent) + InstrStackAdd(pgInstrStack, stack); +} extern Instrumentation *InstrAlloc(int n, int instrument_options); extern void InstrStart(Instrumentation *instr); -extern void InstrStop(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr, bool finalize); extern TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation * tginstr); @@ -163,21 +196,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + if (pgInstrStack) \ + INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + if (pgInstrStack) \ + INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index aede4bfc820a..c02b75480ffb 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 From 050ea03f825bccbfe1a1cec5b8a6290ad8c7bb4a Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:24:49 -0700 Subject: [PATCH 5/7] Use Instrumentation stack for parallel query aggregation in workers --- src/backend/access/brin/brin.c | 6 ++++-- src/backend/access/gin/gininsert.c | 6 ++++-- src/backend/access/nbtree/nbtsort.c | 6 ++++-- src/backend/commands/vacuumparallel.c | 6 ++++-- src/backend/executor/execParallel.c | 6 ++++-- src/backend/executor/instrument.c | 19 +++++++++---------- src/include/executor/instrument.h | 4 ++-- 7 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 45d306037a4b..50111d1bbcbd 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2884,6 +2884,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2933,7 +2934,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2948,7 +2949,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index fc6af7c751b5..fbc6a72e5e41 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2107,6 +2107,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2175,7 +2176,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2190,7 +2191,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index d7695dc1108f..133660565ce3 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1750,6 +1750,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1825,7 +1826,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1835,7 +1836,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 8a37c08871a0..a223304397fe 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -994,6 +994,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1083,7 +1084,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1091,7 +1092,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 8d04b8c6ece7..de3b6ca16375 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1455,6 +1455,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1515,7 +1516,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1531,7 +1532,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 37055d01f61b..02c33b7deadd 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -20,10 +20,8 @@ #include "utils/resowner.h" BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; InstrStack *pgInstrStack = NULL; -static WalUsage save_pgWalUsage; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); @@ -400,21 +398,22 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) } /* start instrumentation during parallel executor startup */ -void +Instrumentation * InstrStartParallelQuery(void) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + Instrumentation *instr = InstrAlloc(1, INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrStart(instr); + return instr; } /* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage) { - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + InstrStop(instr, true); + memcpy(bufusage, &instr->stack->bufusage, sizeof(BufferUsage)); + memcpy(walusage, &instr->stack->walusage, sizeof(WalUsage)); } /* accumulate work done by workers in leader's stats */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 30d81fceaaa3..adcbc75a7572 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -186,8 +186,8 @@ extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation * instr); extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +extern Instrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub); From 6189c02dd98257678b380da5ad6d5e557e8141bd Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:26:56 -0700 Subject: [PATCH 6/7] Convert remaining users of pgBufferUsage to use InstrStart/InstrStop, drop the global --- src/backend/access/heap/vacuumlazy.c | 29 +++++++++++------------ src/backend/commands/analyze.c | 31 ++++++++++++------------ src/backend/commands/explain.c | 26 +++++++-------------- src/backend/commands/explain_dr.c | 31 +++++++++++++----------- src/backend/commands/prepare.c | 26 +++++++-------------- src/backend/executor/instrument.c | 35 +--------------------------- src/include/executor/instrument.h | 8 +------ 7 files changed, 66 insertions(+), 120 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 30778a15639e..bc51f38199da 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -641,8 +641,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + Instrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; @@ -657,6 +656,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrAlloc(1, INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -972,14 +973,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -988,19 +989,17 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; initStringInfo(&buf); if (verbose) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 5e2a7a8234ec..56a71d6cb8ba 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + Instrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrAlloc(1, INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0dbace901e40..fdba163860f6 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrAlloc(1, INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrAlloc(1, INSTRUMENT_TIMER); + if (es->memory) { /* @@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 5833aa80fb5b..d612b624f44f 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + Instrumentation *instr = NULL; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (myState->es->timing || myState->es->buffers) + { + InstrumentOption instrument_options = 0; + + if (myState->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (myState->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + instr = InstrAlloc(1, instrument_options); + InstrStart(instr); + } /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); + if (myState->es->timing || myState->es->buffers) + InstrStop(instr, true); + /* Update timing data */ if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } + INSTR_TIME_ADD(myState->metrics.timeSpent, instr->total); /* Update buffer metrics */ if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + BufferUsageAdd(&myState->metrics.bufferUsage, &instr->stack->bufusage); return true; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 34b6410d6a26..77b4c59e71cf 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -578,14 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrAlloc(1, INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrAlloc(1, INSTRUMENT_TIMER); + if (es->memory) { /* See ExplainOneQuery about this */ @@ -596,9 +598,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -633,8 +633,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -642,13 +641,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -658,7 +650,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 02c33b7deadd..2a141fdae079 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -19,11 +19,9 @@ #include "utils/memutils.h" #include "utils/resowner.h" -BufferUsage pgBufferUsage; WalUsage pgWalUsage; InstrStack *pgInstrStack = NULL; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); /* @@ -428,7 +426,6 @@ InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) WalUsageAdd(&dst->walusage, walusage); } - BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } @@ -443,7 +440,7 @@ InstrStackAdd(InstrStack * dst, InstrStack * add) } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -464,36 +461,6 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ -void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - /* helper functions for WAL usage accumulation */ static void WalUsageAdd(WalUsage *dst, WalUsage *add) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index adcbc75a7572..a627cfcac2d5 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -145,7 +145,6 @@ typedef struct WorkerInstrumentation NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; extern PGDLLIMPORT InstrStack * pgInstrStack; @@ -189,28 +188,23 @@ extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add); extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); #define INSTR_BUFUSAGE_INCR(fld) do { \ - pgBufferUsage.fld++; \ if (pgInstrStack) \ pgInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ - pgBufferUsage.fld += val; \ if (pgInstrStack) \ pgInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ - INSTR_TIME_ADD(pgBufferUsage.fld, val); \ if (pgInstrStack) \ INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ if (pgInstrStack) \ INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \ } while (0) From 8d934812c9513d099911b255b683d6203b9862c4 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Thu, 30 Oct 2025 22:27:30 -0700 Subject: [PATCH 7/7] Index scans: Split heap and index buffer access reporting in EXPLAIN This makes it clear whether activity was on the index directly, or on the table based on heap fetches. --- src/backend/commands/explain.c | 56 ++++++++++++++++------------ src/backend/executor/execProcnode.c | 13 +++++++ src/backend/executor/instrument.c | 25 +++++++++++++ src/backend/executor/nodeIndexscan.c | 15 +++++++- src/include/access/genam.h | 3 ++ src/include/executor/instrument.h | 3 ++ 6 files changed, 91 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index fdba163860f6..6b3ed2e9b404 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1960,6 +1960,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument.table_stack.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2278,7 +2281,8 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->stack.bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage, + IsA(plan, IndexScan) ? "Index" : NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->stack.walusage); @@ -2297,7 +2301,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->stack.bufusage); + show_buffer_usage(es, &instrument->stack.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); @@ -4097,7 +4101,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4122,6 +4126,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4177,6 +4183,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4218,44 +4226,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d00cf820a272..f19af428d979 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -854,7 +854,20 @@ ExecAccumNodeInstrumentation_walker(PlanState *node, void *context) planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context); if (node->instrument && node->instrument->stack.previous) + { + /* + * Index Scan nodes account for heap buffer usage separately, so we + * need to explitly add here + */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrStackAdd(node->instrument->stack.previous, &iss->iss_Instrument.table_stack); + } + InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack); + } return false; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 2a141fdae079..f6abc8d0c19e 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -395,6 +395,31 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } +void +InstrStartNodeStack(NodeInstrumentation * instr, InstrStack * stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(pgInstrStack != NULL); + + InstrPushStack(stack); + } +} + +void +InstrStopNodeStack(NodeInstrumentation * instr, InstrStack * stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(stack->previous != NULL); + + /* Adding to parent is handled by ExecAccumNodeInstrumentation */ + InstrPopStack(stack, false); + } +} + /* start instrumentation during parallel executor startup */ Instrumentation * InstrStartParallelQuery(void) diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 72b135e5dcf0..980c636a640a 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,6 +83,7 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; /* @@ -128,8 +129,20 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->ss.ps.instrument) + InstrStartNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack); + + if (unlikely(!index_fetch_heap(scandesc, slot))) + continue; + + if (node->ss.ps.instrument) + InstrStopNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack); + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9200a22bd9f9..7813b4688f50 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -17,6 +17,7 @@ #include "access/htup.h" #include "access/sdir.h" #include "access/skey.h" +#include "executor/instrument.h" #include "nodes/tidbitmap.h" #include "storage/buf.h" #include "storage/lockdefs.h" @@ -40,6 +41,8 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + /* Buffer usage of heap access during index scans */ + InstrStack table_stack; } IndexScanInstrumentation; /* diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index a627cfcac2d5..9bff6d8303fa 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -185,6 +185,9 @@ extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation * instr); extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add); +extern void InstrStartNodeStack(NodeInstrumentation * dst, InstrStack * stack); +extern void InstrStopNodeStack(NodeInstrumentation * dst, InstrStack * stack); + extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);