diff options
author | Robert Haas | 2025-07-21 19:21:25 +0000 |
---|---|---|
committer | Robert Haas | 2025-07-21 19:55:50 +0000 |
commit | 2ca4a0f83fae01c77908e09e44e6571085777079 (patch) | |
tree | 5e00e580bc7034220ac51764390b5d45cb0e766b | |
parent | c947e932ea2776a9e947fa0b6509f302bdb03dcc (diff) |
WIP: Add pg_plan_advice contrib module.advice
Provide a facility that (1) can be used to stabilize certain plan choices
so that the planner cannot reverse course without authorization and
(2) can be used by knowledgeable users to insist on plan choices contrary
to what the planner believes best. In both cases, terrible outcomes are
possible: users should think twice and perhaps three times before
constraining the planner's ability to do as it thinks best; nevertheless,
there are problems that are much more easily solved with these facilities
than without them.
We take the approach of analyzing a finished plan to produce textual
output, which we call "plan advice", that describes key decisions made
during plan; if that plan advice is provided during future planning
cycles, it will force those key decisions to be made in the same way.
Not all planner decisions can be controlled using advice; for example,
decisions about how to perform aggregation are currently out of scope,
as is choice of sort order. Plan advice can also be edited by the user,
or even written from scratch in simple cases, making it possible to
generate outcomes that the planner would not have produced. Partial
advice can be provided to control some planner outcomes but not others.
Currently, plan advice is focused only on specific outcomes, such as
the choice to use a sequential scan for a particular relation, and not
on estimates that might contribute to those outcomes, such as a
possibly-incorrect selectivity estimate. While it might be useful to
users to be able to provide plan advice that affects selectivity
estimates or other aspects of costing, that is out of scope for this
commit.
For more details, see the pg_plan_advice documentation and the README
file in contrib/pg_plan_advice.
NOTE: This code is currently highly incomplete. It only generates
advice; there's no way to make any use of what is generated. Worse,
the generated advice is incorrect in some cases and missing crucial
information in others. Furthermore, the README and documentation
postulated by the above draft commit message do not yet exist. Some
known problems are called out by XXX in the comments, but this is
certainly not a complete list of issues. IN SHORT: This is early
prototype code, and a lot of things will need to be changed, fixed,
or added before this at all resembles a completed patch.
21 files changed, 3536 insertions, 0 deletions
diff --git a/contrib/Makefile b/contrib/Makefile index 2f0a88d3f7..dd04c20acd 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -34,6 +34,7 @@ SUBDIRS = \ pg_freespacemap \ pg_logicalinspect \ pg_overexplain \ + pg_plan_advice \ pg_prewarm \ pg_stat_statements \ pg_surgery \ diff --git a/contrib/meson.build b/contrib/meson.build index ed30ee7d63..cb718dbdac 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -48,6 +48,7 @@ subdir('pgcrypto') subdir('pg_freespacemap') subdir('pg_logicalinspect') subdir('pg_overexplain') +subdir('pg_plan_advice') subdir('pg_prewarm') subdir('pgrowlocks') subdir('pg_stat_statements') diff --git a/contrib/pg_plan_advice/Makefile b/contrib/pg_plan_advice/Makefile new file mode 100644 index 0000000000..6f8127e551 --- /dev/null +++ b/contrib/pg_plan_advice/Makefile @@ -0,0 +1,30 @@ +# contrib/pg_plan_advice/Makefile + +MODULE_big = pg_plan_advice +OBJS = \ + $(WIN32RES) \ + pg_plan_advice.o \ + pgpa_collector.o \ + pgpa_identifier.o \ + pgpa_join.o \ + pgpa_output.o \ + pgpa_scan.o \ + pgpa_walker.o + +EXTENSION = pg_plan_advice +DATA = pg_plan_advice--1.0.sql +PGFILEDESC = "pg_plan_advice - help the planner get the right plan" + +REGRESS = pg_plan_advice +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_plan_advice +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_plan_advice/meson.build b/contrib/pg_plan_advice/meson.build new file mode 100644 index 0000000000..d9e3ec148c --- /dev/null +++ b/contrib/pg_plan_advice/meson.build @@ -0,0 +1,35 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +pg_plan_advice_sources = files( + 'pg_plan_advice.c', + 'pgpa_collector.c', + 'pgpa_identifier.c', + 'pgpa_join.c', + 'pgpa_output.c', + 'pgpa_scan.c', + 'pgpa_walker.c', +) + +if host_system == 'windows' + pg_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_plan_advice', + '--FILEDESC', 'pg_plan_advice - help the planner get the right plan',]) +endif + +pg_plan_advice = shared_module('pg_plan_advice', + pg_plan_advice_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_plan_advice + +install_data( + 'pg_plan_advice--1.0.sql', + 'pg_plan_advice.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_plan_advice', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), +} diff --git a/contrib/pg_plan_advice/pg_plan_advice--1.0.sql b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql new file mode 100644 index 0000000000..729950bcce --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql @@ -0,0 +1,40 @@ +/* contrib/pg_plan_advice/pg_plan_advice--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_plan_advice" to load this file. \quit + +CREATE FUNCTION pg_clear_collected_local_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_clear_collected_shared_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_shared_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_local_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_shared_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_shared_advice' +LANGUAGE C STRICT; diff --git a/contrib/pg_plan_advice/pg_plan_advice.c b/contrib/pg_plan_advice/pg_plan_advice.c new file mode 100644 index 0000000000..0c54cd8f54 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.c @@ -0,0 +1,342 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.c + * Main entrypoints for generating and applying planner advice + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" +#include "pgpa_identifier.h" +#include "pgpa_output.h" +#include "pgpa_walker.h" + +#include "commands/defrem.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "commands/explain_state.h" +#include "funcapi.h" +#include "storage/dsm_registry.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static pgpa_shared_state *pgpa_state = NULL; +static dsa_area *pgpa_dsa_area = NULL; + +/* GUC variables */ +int pg_plan_advice_local_collection_limit = 0; +int pg_plan_advice_shared_collection_limit = 0; + +/* Saved hook values */ +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static explain_per_plan_hook_type prev_explain_per_plan_hook = NULL; + +/* Other file-level globals */ +static int es_extension_id; +static MemoryContext pgpa_memory_context = NULL; + +static void pg_plan_advice_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_plan_advice_explain_option_handler(ExplainState *es, + DefElem *opt, + ParseState *pstate); +static void pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv); +static char *pg_plan_advice_generate(PlannedStmt *pstmt); + +/* + * Initialize this module. + */ +void +_PG_init(void) +{ + DefineCustomIntVariable("pg_plan_advice.local_collection_limit", + "# of advice entries to retain in per-backend memory", + NULL, + &pg_plan_advice_local_collection_limit, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.shared_collection_limit", + "# of advice entries to retain in shared memory", + NULL, + &pg_plan_advice_shared_collection_limit, + 0, + 0, INT_MAX, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + /* Get an ID that we can use to cache data in an ExplainState. */ + es_extension_id = GetExplainExtensionId("pg_plan_advice"); + + /* Register the new EXPLAIN options implemented by this module. */ + RegisterExtensionExplainOption("plan_advice", + pg_plan_advice_explain_option_handler); + + /* Install hooks */ + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_plan_advice_ExecutorStart; + prev_explain_per_plan_hook = explain_per_plan_hook; + explain_per_plan_hook = pg_plan_advice_explain_per_plan_hook; +} + +/* + * Initialize shared state when first created. + */ +static void +pgpa_init_shared_state(void *ptr) +{ + pgpa_shared_state *state = (pgpa_shared_state *) ptr; + + LWLockInitialize(&state->lock, LWLockNewTrancheId()); + state->dsa_tranche = LWLockNewTrancheId(); + state->area = DSA_HANDLE_INVALID; + state->shared_collector = InvalidDsaPointer; +} + +/* + * Return a pointer to a memory context where long-lived data managed by this + * module can be stored. + */ +MemoryContext +pg_plan_advice_get_mcxt(void) +{ + if (pgpa_memory_context == NULL) + pgpa_memory_context = AllocSetContextCreate(TopMemoryContext, + "pg_plan_advice", + ALLOCSET_DEFAULT_SIZES); + + return pgpa_memory_context; +} + +/* + * Get a pointer to our shared state. + * + * If no shared state exists, create and initialize it. If it does exist but + * this backend has not yet accessed it, attach to it. Otherwise, just return + * our cached pointer. + * + * Along the way, make sure the relevant LWLock tranches are registered. + */ +pgpa_shared_state * +pg_plan_advice_attach(void) +{ + pgpa_shared_state *state; + bool found; + + if (pgpa_state == NULL) + { + state = GetNamedDSMSegment("pg_plan_advice", sizeof(pgpa_shared_state), + pgpa_init_shared_state, &found); + LWLockRegisterTranche(state->lock.tranche, "pg_plan_advice_lock"); + LWLockRegisterTranche(state->dsa_tranche, "pg_plan_advice_dsa"); + pgpa_state = state; + } + + return pgpa_state; +} + +/* + * Return a pointer to pg_plan_advice's DSA area, creating it if needed. + */ +dsa_area * +pg_plan_advice_dsa_area(void) +{ + if (pgpa_dsa_area == NULL) + { + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_handle area_handle; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + area_handle = state->area; + if (area_handle == DSA_HANDLE_INVALID) + { + pgpa_dsa_area = dsa_create(state->dsa_tranche); + dsa_pin(pgpa_dsa_area); + state->area = dsa_get_handle(pgpa_dsa_area); + LWLockRelease(&state->lock); + } + else + { + LWLockRelease(&state->lock); + pgpa_dsa_area = dsa_attach(area_handle); + } + + dsa_pin_mapping(pgpa_dsa_area); + + MemoryContextSwitchTo(oldcontext); + } + + return pgpa_dsa_area; +} + +static void +pg_plan_advice_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + PlannedStmt *pstmt = queryDesc->plannedstmt; + + if (pg_plan_advice_local_collection_limit > 0 + || pg_plan_advice_shared_collection_limit > 0) + { + char *advice; + + advice = pg_plan_advice_generate(pstmt); + + /* + * If the advice string is non-empty, pass it to the collectors. + * + * A query such as SELECT 2+2 or SELECT * FROM generate_series(1,10) + * will not produce any advice, since there are no query planning + * decisions that can be influenced. It wouldn't exactly be wrong to + * record the query together with the empty advice string, but there + * doesn't seem to be much value in it, so skip it to save space. + * + * If this proves confusing to users, we might need to revist the + * behavior here. + */ + if (advice[0] != '\0') + pgpa_collect_advice(pstmt->queryId, queryDesc->sourceText, + advice); + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); +} + +/* + * Handler for EXPLAIN (PLAN_ADVICE). + */ +static void +pg_plan_advice_explain_option_handler(ExplainState *es, DefElem *opt, + ParseState *pstate) +{ + bool *plan_advice; + + plan_advice = GetExplainExtensionState(es, es_extension_id); + + if (plan_advice == NULL) + { + plan_advice = palloc0_object(bool); + SetExplainExtensionState(es, es_extension_id, plan_advice); + } + + *plan_advice = defGetBoolean(opt); +} + +/* + * If the PLAN_ADVICE option was specified -- and not set to FALSE -- generate + * advice from the provided plan and add it to the EXPLAIN output. + */ +static void +pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv) +{ + bool *plan_advice; + + if (prev_explain_per_plan_hook) + prev_explain_per_plan_hook(plannedstmt, into, es, queryString, + params, queryEnv); + + plan_advice = GetExplainExtensionState(es, es_extension_id); + if (plan_advice != NULL && *plan_advice) + { + char *advice = pg_plan_advice_generate(plannedstmt); + + /* + * The advice string likely spans multiple lines; the last line will + * not end in a newline, but the others will. In text format, it looks + * nicest to indent each line of the advice separately, beginning on + * the line below the "Plan Advice" label. For non-text formats, it's + * best not to add any special handling. + */ + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyText("Plan Advice", advice, es); + else if (*advice != '\0') + { + char *s; + + ExplainIndentText(es); + appendStringInfo(es->str, "Plan Advice:\n"); + + es->indent++; + + while ((s = strchr(advice, '\n')) != NULL) + { + ExplainIndentText(es); + appendBinaryStringInfo(es->str, advice, (s - advice) + 1); + advice = s + 1; + } + + if (*advice != '\0') + { + ExplainIndentText(es); + appendStringInfo(es->str, "%s\n", advice); + } + + es->indent--; + } + } +} + +/* + * Generate advice from a query plan and send it to the relevant collectors. + */ +static char * +pg_plan_advice_generate(PlannedStmt *pstmt) +{ + pgpa_plan_walker_context walker; + StringInfoData buf; + ListCell *lc; + const char **rt_identifiers; + + /* Construct unique identifiers for non-JOIN RTEs. */ + rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt); + + /* Initialization. */ + memset(&walker, 0, sizeof(pgpa_plan_walker_context)); + walker.pstmt = pstmt; + + /* Walk the main plan tree. */ + pgpa_plan_walker(&walker, pstmt->planTree, 0, NULL, NIL); + + /* Main plan tree walk won't reach subplans, so walk those. */ + foreach(lc, pstmt->subplans) + { + Plan *plan = lfirst(lc); + + if (plan != NULL) + pgpa_plan_walker(&walker, plan, 0, NULL, NIL); + } + + /* Put advice into string form. */ + initStringInfo(&buf); + pgpa_output_advice(&buf, &walker, rt_identifiers); + return buf.data; +} diff --git a/contrib/pg_plan_advice/pg_plan_advice.control b/contrib/pg_plan_advice/pg_plan_advice.control new file mode 100644 index 0000000000..aa6fdc9e7b --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.control @@ -0,0 +1,5 @@ +# pg_plan_advice extension +comment = 'help the planner get the right plan' +default_version = '1.0' +module_pathname = '$libdir/pg_plan_advice' +relocatable = true diff --git a/contrib/pg_plan_advice/pg_plan_advice.h b/contrib/pg_plan_advice/pg_plan_advice.h new file mode 100644 index 0000000000..385d3f3052 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.h + * header file for pg_plan_advice contrib module + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_PLAN_ADVICE_H +#define PG_PLAN_ADVICE_H + +#include "storage/lwlock.h" +#include "utils/dsa.h" + +typedef struct pgpa_shared_state +{ + LWLock lock; + int dsa_tranche; + dsa_handle area; + dsa_pointer shared_collector; +} pgpa_shared_state; + +/* GUC variables */ +extern int pg_plan_advice_local_collection_limit; +extern int pg_plan_advice_shared_collection_limit; + +/* Function prototypes */ +extern MemoryContext pg_plan_advice_get_mcxt(void); +extern pgpa_shared_state *pg_plan_advice_attach(void); +extern dsa_area *pg_plan_advice_dsa_area(void); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_collector.c b/contrib/pg_plan_advice/pgpa_collector.c new file mode 100644 index 0000000000..1bcefe1fdf --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.c @@ -0,0 +1,630 @@ +/*------------------------------------------------------------------------- + * + * pgpa_collector.c + * collect advice into backend-local or shared memory + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_collector.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" + +#include "datatype/timestamp.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "utils/builtins.h" +#include "utils/timestamp.h" + +PG_FUNCTION_INFO_V1(pg_clear_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_clear_collected_shared_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_shared_advice); + +#define ADVICE_CHUNK_SIZE 1024 +#define ADVICE_CHUNK_ARRAY_SIZE 64 + +#define PG_GET_ADVICE_COLUMNS 7 + +/* + * Advice extracted from one query plan, together with the query string + * and various other identifying details. + */ +typedef struct pgpa_collected_advice +{ + Oid userid; /* user OID */ + Oid dbid; /* database OID */ + uint64 queryid; /* query identifier */ + TimestampTz timestamp; /* query timestamp */ + int advice_offset; /* start of advice in textual data */ + char textual_data[FLEXIBLE_ARRAY_MEMBER]; +} pgpa_collected_advice; + +/* + * A bunch of pointers to pgpa_collected_advice objects, stored in + * backend-local memory. + */ +typedef struct pgpa_local_advice_chunk +{ + pgpa_collected_advice *entries[ADVICE_CHUNK_SIZE]; +} pgpa_local_advice_chunk; + +/* + * Information about all of the pgpa_collected_advice objects that we're + * storing in local memory. + * + * We assign consecutive IDs, starting from 0, to each pgpa_collected_advice + * object that we store. The actual storage is an array of chunks, which + * helps keep memcpy() overhead low when we start discarding older data. + */ +typedef struct pgpa_local_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + pgpa_local_advice_chunk **chunks; +} pgpa_local_advice; + +/* + * Just like pgpa_local_advice_chunk, but stored in a dynamic shared area, + * so we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice_chunk +{ + dsa_pointer entries[ADVICE_CHUNK_SIZE]; +} pgpa_shared_advice_chunk; + +/* + * Just like pgpa_local_advice, but stored in a dynamic shared area, so + * we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + dsa_pointer chunks; +} pgpa_shared_advice; + +/* Pointers to local and shared collectors */ +static pgpa_local_advice *local_collector = NULL; +static pgpa_shared_advice *shared_collector = NULL; + +/* Static functions */ +static pgpa_collected_advice *pgpa_make_collected_advice(Oid userid, + Oid dbid, + uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, + dsa_pointer *result); +static void pgpa_store_local_advice(pgpa_collected_advice *ca); +static void pgpa_trim_local_advice(int limit); +static void pgpa_store_shared_advice(dsa_pointer ca_pointer); +static void pgpa_trim_shared_advice(dsa_area *area, int limit); + +/* Helper function to extract the query string from pgpa_collected_advice */ +static inline const char * +query_string(pgpa_collected_advice *ca) +{ + return ca->textual_data; +} + +/* Helper function to extract the advice string from pgpa_collected_advice */ +static inline const char * +advice_string(pgpa_collected_advice *ca) +{ + return ca->textual_data + ca->advice_offset; +} + +/* + * Store collected query advice into the local or shared advice collector, + * as appropriate. + */ +void +pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string) +{ + Oid userid = GetUserId(); + Oid dbid = MyDatabaseId; + TimestampTz now = GetCurrentTimestamp(); + + if (pg_plan_advice_local_collection_limit > 0) + { + pgpa_collected_advice *ca; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + ca = pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, + NULL, NULL); + pgpa_store_local_advice(ca); + MemoryContextSwitchTo(oldcontext); + } + + if (pg_plan_advice_shared_collection_limit > 0) + { + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer ca_pointer; + + pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, area, + &ca_pointer); + pgpa_store_shared_advice(ca_pointer); + } +} + +/* + * Allocate and fill a new pgpa_collected_advice object. + * + * If area != NULL, it is used to allocate the new object, and the resulting + * dsa_pointer is returned via *result. + * + * If area == NULL, the new object is allocated in the current memory context, + * and result is not examined or modified. + */ +static pgpa_collected_advice * +pgpa_make_collected_advice(Oid userid, Oid dbid, uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, dsa_pointer *result) +{ + size_t query_string_length = strlen(query_string) + 1; + size_t advice_string_length = strlen(advice_string) + 1; + size_t total_length; + pgpa_collected_advice *ca; + + total_length = offsetof(pgpa_collected_advice, textual_data) + + query_string_length + advice_string_length; + + if (area == NULL) + ca = palloc(total_length); + else + { + *result = dsa_allocate(area, total_length); + ca = dsa_get_address(area, *result); + } + + ca->userid = GetUserId(); + ca->dbid = MyDatabaseId; + ca->queryid = queryId; + ca->timestamp = timestamp; + ca->advice_offset = query_string_length; + + memcpy(ca->textual_data, query_string, query_string_length); + memcpy(&ca->textual_data[ca->advice_offset], + advice_string, advice_string_length); + + return ca; +} + +/* + * Add a pg_collected_advice object to our backend-local advice collection. + * + * Caller is responsible for switching to the appropriate memory context; + * the provided object should have been allocated in that same context. + */ +static void +pgpa_store_local_advice(pgpa_collected_advice *ca) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_local_advice *la = local_collector; + + /* If the local advice collector isn't initialized yet, do that now. */ + if (la == NULL) + { + la = palloc0(sizeof(pgpa_local_advice)); + la->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = palloc0_array(pgpa_local_advice_chunk *, + la->chunk_array_allocated_size); + local_collector = la; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (la->next_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->next_id - la->base_id) % ADVICE_CHUNK_SIZE; + + /* Extend chunk array, if needed. */ + if (chunk_number > la->chunk_array_allocated_size) + { + int new_size; + + new_size = la->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = repalloc0_array(la->chunks, + pgpa_local_advice_chunk *, + la->chunk_array_allocated_size, + new_size); + la->chunk_array_allocated_size = new_size; + } + + /* Allocate new chunk, if needed. */ + if (la->chunks[chunk_number] == NULL) + la->chunks[chunk_number] = palloc0_object(pgpa_local_advice_chunk); + + /* Save pointer and bump next-id counter. */ + Assert(la->chunks[chunk_number]->entries[chunk_offset] == NULL); + la->chunks[chunk_number]->entries[chunk_offset] = ca; + ++la->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_local_advice(pg_plan_advice_local_collection_limit); +} + +/* + * Add a pg_collected_advice object to the shared advice collection. + * + * 'ca_pointer' should have been allocated from the pg_plan_advice DSA area + * and should point to an object of type pgpa_collected_advice. + */ +static void +pgpa_store_shared_advice(dsa_pointer ca_pointer) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + pgpa_shared_advice *sa = shared_collector; + dsa_pointer *chunk_array; + pgpa_shared_advice_chunk *chunk; + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now. + * If we're the first ones to attach, we may need to create the object. + */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + state->shared_collector = + dsa_allocate0(area, sizeof(pgpa_shared_advice)); + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* + * It's possible that some other backend may have succeeded in creating + * the main collector object but failed to allocate an initial chunk + * array, so we must be prepared to allocate the chunk array here whether + * or not we created the collector object. + */ + if (shared_collector->chunk_array_allocated_size == 0) + { + sa->chunks = + dsa_allocate0(area, + sizeof(dsa_pointer) * ADVICE_CHUNK_ARRAY_SIZE); + sa->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (sa->next_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->next_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + /* Get the address of the chunk array and, if needed, extend it. */ + if (chunk_number > sa->chunk_array_allocated_size) + { + int new_size; + dsa_pointer new_chunks; + + /* + * DSA can't enlarge an existing allocation, so we must make a new + * allocation and copy data over. + */ + new_size = sa->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + new_chunks = dsa_allocate0(area, sizeof(dsa_pointer) * new_size); + chunk_array = dsa_get_address(area, new_chunks); + memcpy(chunk_array, dsa_get_address(area, sa->chunks), + sizeof(dsa_pointer) * sa->chunk_array_allocated_size); + dsa_free(area, sa->chunks); + sa->chunks = new_chunks; + sa->chunk_array_allocated_size = new_size; + } + else + chunk_array = dsa_get_address(area, sa->chunks); + + /* Get the address of the desired chunk, allocating it if needed. */ + if (chunk_array[chunk_number] == InvalidDsaPointer) + chunk_array[chunk_number] = + dsa_allocate0(area, sizeof(pgpa_shared_advice_chunk)); + chunk = dsa_get_address(area, chunk_array[chunk_number]); + + /* Save pointer and bump next-id counter. */ + Assert(chunk->entries[chunk_offset] == InvalidDsaPointer); + chunk->entries[chunk_offset] = ca_pointer; + ++sa->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_shared_advice(area, pg_plan_advice_shared_collection_limit); + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); +} + +/* + * Discard collected advice stored in backend-local memory in excess of the + * specified limit. + */ +static void +pgpa_trim_local_advice(int limit) +{ + pgpa_local_advice *la = local_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = la->next_id - la->oldest_id; + if (current_count <= limit) + return; + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + + chunk_number = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->oldest_id - la->base_id) % ADVICE_CHUNK_SIZE; + + Assert(la->chunks[chunk_number]->entries[chunk_offset] != NULL); + pfree(la->chunks[chunk_number]->entries[chunk_offset]); + la->chunks[chunk_number]->entries[chunk_offset] = NULL; + ++la->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + pfree(la->chunks[n]); + + /* Slide remaining chunk pointers back toward the base of the array. */ + total_chunk_count = (la->next_id - la->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&la->chunks[0], &la->chunks[trim_chunk_count], + sizeof(pgpa_local_advice_chunk *) * remaining_chunk_count); + + /* Adjust base ID value accordingly. */ + la->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * Discard collected advice stored in shared memory in excess of the + * specified limit. + */ +static void +pgpa_trim_shared_advice(dsa_area *area, int limit) +{ + pgpa_shared_advice *sa = shared_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + dsa_pointer *chunk_array; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = sa->next_id - sa->oldest_id; + if (current_count <= limit) + return; + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + + chunk_number = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->oldest_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + Assert(chunk->entries[chunk_offset] != InvalidDsaPointer); + dsa_free(area, chunk->entries[chunk_offset]); + chunk->entries[chunk_offset] = InvalidDsaPointer; + ++sa->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + dsa_free(area, chunk_array[n]); + + /* Slide remaining chunk pointers back toward the base of the array. */ + total_chunk_count = (sa->next_id - sa->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&chunk_array[0], &chunk_array[trim_chunk_count], + sizeof(dsa_pointer) * remaining_chunk_count); + + /* Adjust base ID value accordingly. */ + sa->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_local_advice(PG_FUNCTION_ARGS) +{ + pgpa_trim_local_advice(0); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_shared_advice(PG_FUNCTION_ARGS) +{ + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. + */ + if (shared_collector == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = dsa_get_address(area, state->shared_collector); + } + + /* Do the real work */ + pgpa_trim_shared_advice(area, 0); + + LWLockRelease(&state->lock); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable SRF to return advice collected in backend-local memory + */ +Datum +pg_get_collected_local_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_local_advice *la = local_collector; + + /* + * XXX. Is this the correct thing from a security point of view? + */ + if (InSecurityRestrictedOperation()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("cannot call \"%s\" within security-restricted operation", + "pg_get_collected_local_advice"))); + + InitMaterializedSRF(fcinfo, 0); + + if (la == NULL) + return (Datum) 0; + + /* Loop over all entries. */ + for (uint64 id = la->oldest_id; id < la->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - la->base_id) % ADVICE_CHUNK_SIZE; + + ca = la->chunks[chunk_number]->entries[chunk_offset]; + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + +/* + * SQL-callable SRF to return advice collected in shared memory + */ +Datum +pg_get_collected_shared_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer *chunk_array; + pgpa_shared_advice *sa = shared_collector; + + InitMaterializedSRF(fcinfo, 0); + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_SHARED); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. + */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Loop over all entries. */ + for (uint64 id = sa->oldest_id; id < sa->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + ca = dsa_get_address(area, chunk->entries[chunk_offset]); + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); + + return (Datum) 0; +} diff --git a/contrib/pg_plan_advice/pgpa_collector.h b/contrib/pg_plan_advice/pgpa_collector.h new file mode 100644 index 0000000000..6f8efa27f0 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.h @@ -0,0 +1,3 @@ + +extern void pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string); diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c new file mode 100644 index 0000000000..56ec832b23 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.c + * create appropriate identifiers for range table entries + * + * The goal of this module is to be able to produce identifiers for range + * table entries that are unique, understandable to human beings, and + * able to be reconstructed during future planning cycles. As an + * exception, we do not care about, or want to produce, identifiers for + * RTE_JOIN entries. This is because (1) we would end up with a ton of + * RTEs with unhelpful names like unnamed_join_17; (2) not all joins have + * RTEs; and (3) we intend to refer to joins by their constituent members + * rather than by reference to the join RTE. + * + * In general, we construct identifiers of the following form: + * + * alias_name#occurrence_number/child_table_name@subquery_name + * + * However, occurrence_number is omitted when it is the first occurrence + * within the same subquery, child_table_name is omitted for relations that + * are not child tables, and subquery_name is omitted for the topmost + * query level. Whenever an item is omitted, the preceding punctuation mark + * is also omitted. Identifier-style escaping is applied to alias_name and + * subquery_name. child_table_name is always a schema-qualified name, and + * identifier-style escaping is applied to the schema and to the relation + * names separately. + * + * The upshot of all of these rules is that in simple cases, the relation + * identifier is textually identical to the alias name, making life easier + * for users. However, even in complex cases, every relation identifier + * for a given query will be unique (or at least we hope so: if not, this + * code is buggy and the identifier format might need to be rethought). + * + * A key goal of this system is that we want to be able to reconstruct the + * same identifiers during a future planning cycle for the same query, so + * that if a certain behavior is specified for a certain identifier, we can + * properly identify the RTI for which that behavior is mandated. In order + * for this to work, subquery names must be unique and known before the + * subquery is planned, and the remainder of the identifier must not depend + * on any part of the query outside of the current subquery level. In + * particular, occurrence_number must be calculated relative to the range + * table for the relevant subquery, not the final flattened range table. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_identifier.h" + +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +static Index *pgpa_create_top_rti_map(Index rtable_length, List *rtable, + List *appinfos); +static int pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti); + +/* + * Create a range table identifier from scratch. + * + * This function leaves the caller to do all the heavy lifting, so it's + * generally better to use one of the functions below instead. + * + * See the file header comments for more details on the format of an + * identifier. + */ +const char * +pgpa_create_identifier(const char *alias_name, int occurrence, + const char *partnsp, const char *partrel, + const char *plan_name) +{ + const char *result; + + result = quote_identifier(alias_name); + + if (occurrence != 1) + { + Assert(occurrence > 1); + result = psprintf("%s#%d", result, occurrence); + } + + if (partrel != NULL) + { + Assert(partnsp != NULL); + result = psprintf("%s/%s.%s", result, + quote_identifier(partnsp), + quote_identifier(partrel)); + } + + if (plan_name != NULL) + result = psprintf("%s@%s", result, quote_identifier(plan_name)); + + return result; +} + +/* + * Create a range table identifier for a given RTI. + */ +const char * +pgpa_create_identifier_by_rti(PlannerInfo *root, Index rti, char *plan_name) +{ + Index top_rti = rti; + int occurrence = 1; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + + /* + * If this is a child RTE, find the topmost parent that is still of type + * RTE_RELATION. We do this because we identify children of partitioned + * tables by the name of the child table, but subqueries can also have + * child rels and we don't care about those here. + */ + for (;;) + { + AppendRelInfo *appinfo = root->append_rel_array[top_rti]; + RangeTblEntry *parent_rte; + + if (appinfo == NULL) + break; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind != RTE_RELATION) + break; + + top_rti = appinfo->parent_relid; + } + + /* Get the range table entries for the RTI and top RTI. */ + rte = planner_rt_fetch(rti, root); + top_rte = planner_rt_fetch(top_rti, root); + Assert(rte->rtekind != RTE_JOIN); + Assert(top_rte->rtekind != RTE_JOIN); + + /* Work out the correct occurrence number. */ + for (Index prior_rti = 1; prior_rti < rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + AppendRelInfo *appinfo = root->append_rel_array[prior_rti]; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + */ + if (appinfo != NULL) + { + RangeTblEntry *parent_rte; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind == RTE_RELATION) + continue; + } + + /* Skip joins. */ + prior_rte = planner_rt_fetch(prior_rti, root); + if (prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. */ + ++occurrence; + } + + /* If this is a child table, get the schema and relation names. */ + if (rti != top_rti) + { + partnsp = get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* We have everything we need to create the identifier. */ + return pgpa_create_identifier(top_rte->eref->aliasname, + occurrence, partnsp, partrel, + plan_name); +} + +/* + * Create an array of range table identifiers for all the non-NULL, + * non-RTE_JOIN entries in the PlannedStmt's range table. + */ +const char ** +pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt) +{ + Index rtable_length = list_length(pstmt->rtable); + const char **result = palloc0_array(const char *, rtable_length); + Index *top_rti_map; + int rtinfoindex = 0; + SubPlanRTInfo *rtinfo = NULL; + SubPlanRTInfo *nextrtinfo = NULL; + + /* + * Account for relations addded by inheritance expansion of partitioned + * tables. + */ + top_rti_map = pgpa_create_top_rti_map(rtable_length, pstmt->rtable, + pstmt->appendRelations); + + /* + * When we begin iterating, we're processing the portion of the range + * table that originated from the top-level PlannerInfo, so subrtinfo is + * NULL. Later, subrtinfo will be the SubPlanRTInfo for the subquery whose + * portion of the range table we are processing. nextrtinfo is always the + * SubPlanRTInfo that follows the current one, if any, so when we're + * processing the top-level query's portion of the range table, the next + * SubPlanRTInfo is the very first one. + */ + if (pstmt->subrtinfos != NULL) + nextrtinfo = linitial(pstmt->subrtinfos); + + /* Main loop over the range table. */ + for (Index rti = 1; rti <= rtable_length; rti++) + { + const char *plan_name; + Index top_rti; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + int occurrence; + + /* + * Advance to the next SubPlanRTInfo, if it's time to do that. + * + * This loop probably shouldn't ever iterate more than once, because + * that would imply that a subquery was planned but added nothing to + * the range table; but let's be defensive and assume it can happen. + */ + while (nextrtinfo != NULL && rti > nextrtinfo->rtoffset) + { + rtinfo = nextrtinfo; + if (++rtinfoindex >= list_length(pstmt->subrtinfos)) + nextrtinfo = NULL; + else + nextrtinfo = list_nth(pstmt->subrtinfos, rtinfoindex); + } + + /* Fetch the range table entry, if any. */ + rte = rt_fetch(rti, pstmt->rtable); + + /* + * We can't and don't need to identify null entries, and we don't want + * to identify join entries. + */ + if (rte == NULL || rte->rtekind == RTE_JOIN) + continue; + + /* + * If this is not a relation added by partitioned table expansion, + * then the top RTI/RTE are just the same as this RTI/RTE. Otherwise, + * we need the information for the top RTI/RTE, and must also fetch + * the partition schema and name. + */ + top_rti = top_rti_map[rti - 1]; + if (rti == top_rti) + top_rte = rte; + else + { + top_rte = rt_fetch(top_rti, pstmt->rtable); + partnsp = + get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* Compute the correct occurrence number. */ + occurrence = pgpa_occurrence_number(pstmt->rtable, top_rti_map, + rtinfo, top_rti); + + /* Get the name of the current plan (NULL for toplevel query). */ + plan_name = rtinfo == NULL ? NULL : rtinfo->plan_name; + + /* OK, we're ready to create the identifier. */ + result[rti - 1] = pgpa_create_identifier(top_rte->eref->aliasname, + occurrence, + partnsp, partrel, + plan_name); + } + + return result; +} + +/* + * Build a mapping from each RTI to the RTI whose alias_name will be used to + * construct the range table identifier. + * + * For child relations, this is the topmost parent that is still of type + * RTE_RELATION. For other relations, it's just the original RTI. + * + * Since we're eventually going to need this information for every RTI in + * the range table, it's best to compute all the answers in a single pass over + * the AppendRelInfo list. Otherwise, we might end up searching through that + * list repeatedly for entries of interest. + * + * Note that the returned array is uses zero-based indexing, while RTIs use + * 1-based indexing, so subtract 1 from the RTI before looking it up in the + * array. + */ +static Index * +pgpa_create_top_rti_map(Index rtable_length, List *rtable, List *appinfos) +{ + Index *top_rti_map = palloc0_array(Index, rtable_length); + + /* Initially, make every RTI point to itself. */ + for (Index rti = 1; rti <= rtable_length; ++rti) + top_rti_map[rti - 1] = rti; + + /* Update the map for each AppendRelInfo object. */ + foreach_node(AppendRelInfo, appinfo, appinfos) + { + Index parent_rti = appinfo->parent_relid; + RangeTblEntry *parent_rte = rt_fetch(parent_rti, rtable); + + /* If the parent is not RTE_RELATION, ignore this entry. */ + if (parent_rte->rtekind != RTE_RELATION) + continue; + + /* + * Map the child to wherever we mapped the parent. Parents always + * precede their children in the AppendRelInfo list, so this should + * work out. + */ + top_rti_map[appinfo->child_relid - 1] = top_rti_map[parent_rti - 1]; + } + + return top_rti_map; +} + +/* + * Find the occurence number of a certain relation within a certain subquery. + * + * The same alias name can occur multiple times within a subquery, but we want + * to disambiguate by giving different occurrences different integer indexes. + * However, child tables are disambiguated by including the table name rather + * than by incrementing the occurrence number; and joins are not named and so + * shouldn't increment the occurence number either. + */ +static int +pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti) +{ + Index rtoffset = (rtinfo == NULL) ? 0 : rtinfo->rtoffset; + int occurrence = 1; + RangeTblEntry *rte = rt_fetch(rti, rtable); + + for (Index prior_rti = rtoffset + 1; prior_rti < rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + */ + if (top_rti_map[prior_rti - 1] != prior_rti) + break; + + /* Skip joins. */ + prior_rte = rt_fetch(prior_rti, rtable); + if (prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. */ + ++occurrence; + } + + return occurrence; +} diff --git a/contrib/pg_plan_advice/pgpa_identifier.h b/contrib/pg_plan_advice/pgpa_identifier.h new file mode 100644 index 0000000000..7684ba0a6a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.h + * create appropriate identifiers for range table entries + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PGPA_IDENTIFIER_H +#define PGPA_IDENTIFIER_H + +#include "nodes/pathnodes.h" +#include "nodes/plannodes.h" + +extern const char *pgpa_create_identifier(const char *alias_name, + int occurrence, + const char *partnsp, + const char *partrel, + const char *plan_name); +extern const char *pgpa_create_identifier_by_rti(PlannerInfo *root, + Index rti, + char *plan_name); +extern const char **pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c new file mode 100644 index 0000000000..1984bd1352 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -0,0 +1,601 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.c + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/pathnodes.h" +#include "nodes/print.h" +#include "parser/parsetree.h" + +/* + * Temporary object used when unrolling a join tree. + */ +struct pgpa_join_unroller +{ + unsigned nallocated; + unsigned nused; + Plan *outer_subplan; + ElidedNode *outer_elided_node; + pgpa_join_strategy *strategy; + Plan **inner_subplans; + ElidedNode **inner_elided_nodes; + pgpa_join_unroller **inner_unrollers; +}; + +static pgpa_join_strategy pgpa_decompose_join(pgpa_plan_walker_context *walker, + Plan *plan, + Plan **realouter, + Plan **realinner, + ElidedNode **elidedrealouter, + ElidedNode **elidedrealinner); +static ElidedNode *pgpa_descend_node(PlannedStmt *pstmt, Plan **plan); +static ElidedNode *pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan); +static bool pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node); + +static Bitmapset *pgpa_add_member_relids(Bitmapset *relids, + pgpa_join_member *member); + +static bool is_result_node_with_child(Plan *plan); +static bool is_sorting_plan(Plan *plan); + +/* + * Create an initially-empty object for unrolling joins. + * + * See comments for pgpa_unrolled_join and pgpa_clumped_join in pgpa_join.h + * for a discussion of clumped and unrolled joins. This function creates a + * helper object that can later be used to create a pgpa_unrolled_join, after + * first calling pgpa_unroll_join one or more times. + */ +pgpa_join_unroller * +pgpa_create_join_unroller(void) +{ + pgpa_join_unroller *join_unroller; + + join_unroller = palloc0_object(pgpa_join_unroller); + join_unroller->nallocated = 4; + join_unroller->strategy = + palloc_array(pgpa_join_strategy, join_unroller->nallocated); + join_unroller->inner_subplans = + palloc_array(Plan *, join_unroller->nallocated); + join_unroller->inner_elided_nodes = + palloc_array(ElidedNode *, join_unroller->nallocated); + join_unroller->inner_unrollers = + palloc_array(pgpa_join_unroller *, join_unroller->nallocated); + + return join_unroller; +} + +/* + * Unroll one level of an unrollable join tree. + * + * Our basic goal here is to unroll join trees as they occur in the Plan + * tree into a simpler and more regular structure that we can more easily + * use for further processing. Unrolling is outer-deep, so if the plan tree + * has Join1(Join2(A,B),Join3(C,D)), the same join unroller object should be + * used for Join1 and Join2, but a different one will be needed for Join3, + * since that involves a join within the *inner* side of another join. + * + * pgpa_plan_walker creates a "top level" join unroller object when it + * encounters a join in a portion of the plan tree in which no join unroller + * is already active. From there, this function is responsible for determing + * to what portion of the plan tree that join unroller applies, and for + * creating any subordinate join unroller objects that are needed as a result + * of non-outer-deep join trees. We do this by returning the join unroller + * objects that should be used for further traversal of the outer and inner + * subtrees of the current plan node via *outer_join_unroller and + * *inner_join_unroller, respectively. + */ +void +pgpa_unroll_join(pgpa_plan_walker_context *walker, Plan *plan, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller) +{ + pgpa_join_strategy strategy; + Plan *realinner, + *realouter; + ElidedNode *elidedinner, + *elidedouter; + int n; + + Assert(join_unroller != NULL); + + /* + * We need to pass the join_unroller object down through certain types of + * plan nodes -- anything that's considered part of the join strategy, and + * any other nodes that can occur in a join tree despite not being scans + * or joins. + * + * This includes: + * + * (1) Materialize, Memoize, and Hash nodes, which are part of the join + * strategy, + * + * (2) Gather and Gather Merge nodes, which can occur at any point in the + * join tree where the planner decided to initiate parallelism, + * + * (3) Sort and IncrementalSort nodes, which can occur beneath MergeJoin + * or GatherMerge, + * + * (4) Agg and Unique nodes, which can occur when we decide to make the + * nullable side of a semijoin unique and then join the result, and + * + * (5) Result nodes with children, which can be added either to project to + * enforce a one-time filter (but Result nodes without children are + * degenerate scans or joins). + */ + if (IsA(plan, Material) || IsA(plan, Memoize) || IsA(plan, Hash) + || IsA(plan, Gather) || IsA(plan, GatherMerge) + || is_sorting_plan(plan) || IsA(plan, Agg) || IsA(plan, Unique) + || is_result_node_with_child(plan)) + { + *outer_join_unroller = join_unroller; + return; + } + + /* + * Since we've already handled nodes that require pass-through treatment, + * this should be an unrollable join. + */ + strategy = pgpa_decompose_join(walker, plan, + &realouter, &realinner, + &elidedouter, &elidedinner); + + /* If our workspace is full, expand it. */ + if (join_unroller->nused >= join_unroller->nallocated) + { + join_unroller->nallocated *= 2; + join_unroller->strategy = + repalloc_array(join_unroller->strategy, + pgpa_join_strategy, + join_unroller->nallocated); + join_unroller->inner_subplans = + repalloc_array(join_unroller->inner_subplans, + Plan *, + join_unroller->nallocated); + join_unroller->inner_elided_nodes = + repalloc_array(join_unroller->inner_elided_nodes, + ElidedNode *, + join_unroller->nallocated); + join_unroller->inner_unrollers = + repalloc_array(join_unroller->inner_unrollers, + pgpa_join_unroller *, + join_unroller->nallocated); + } + + /* + * Since we're flattening outer-deep join trees, it follows that if the + * outer side is still an unrollable join, it should be unrolled into this + * same object. Otherwise, we've reached the limit of what we can unroll + * into this object and must remember the outer side as the final outer + * subplan. + */ + if (elidedouter == NULL && pgpa_is_join(realouter)) + *outer_join_unroller = join_unroller; + else + { + join_unroller->outer_subplan = realouter; + join_unroller->outer_elided_node = elidedouter; + } + + /* + * Store the inner subplan. If it's an unrollable join, it needs to be + * flattened in turn, but into a new unroller object, not this one. + */ + n = join_unroller->nused++; + join_unroller->strategy[n] = strategy; + join_unroller->inner_subplans[n] = realinner; + join_unroller->inner_elided_nodes[n] = elidedinner; + if (elidedinner == NULL && pgpa_is_join(realinner)) + *inner_join_unroller = pgpa_create_join_unroller(); + else + *inner_join_unroller = NULL; + join_unroller->inner_unrollers[n] = *inner_join_unroller; +} + +/* + * Use the data we've accumulated in a pgpa_join_unroller object to construct + * a pgpa_unrolled_join. + */ +pgpa_unrolled_join * +pgpa_build_unrolled_join(PlannedStmt *pstmt, + pgpa_join_unroller *join_unroller) +{ + pgpa_unrolled_join *ujoin; + int i; + + /* + * We shouldn't have gone even so far as to create a join unroller unless + * we found at least one unrollable join. + */ + Assert(join_unroller->nused > 0); + + /* Allocate result structures. */ + ujoin = palloc0_object(pgpa_unrolled_join); + ujoin->ninner = join_unroller->nused; + ujoin->strategy = palloc0_array(pgpa_join_strategy, join_unroller->nused); + ujoin->inner = palloc0_array(pgpa_join_member, join_unroller->nused); + + /* Handle the outermost join. */ + ujoin->outer.plan = join_unroller->outer_subplan; + ujoin->outer.elided_node = join_unroller->outer_elided_node; + ujoin->outer.scan = pgpa_build_scan(pstmt, ujoin->outer.plan, + ujoin->outer.elided_node, true); + + /* + * We want the joins from the deepest part of the plan tree to appear + * first in the result object, but the join unroller adds them in exactly + * the reverse of that order, so we need to flip the order of the arrays + * when constructing the final result. + */ + for (i = 0; i < join_unroller->nused; ++i) + { + int k = join_unroller->nused - i - 1; + + /* Copy strategy, Plan, and ElidedNode. */ + ujoin->strategy[i] = join_unroller->strategy[k]; + ujoin->inner[i].plan = join_unroller->inner_subplans[k]; + ujoin->inner[i].elided_node = join_unroller->inner_elided_nodes[k]; + + /* + * Fill in remaining details, using either the nested join unroller, + * or by deriving them from the plan and elided nodes. + */ + if (join_unroller->inner_unrollers[k] != NULL) + ujoin->inner[i].unrolled_join = + pgpa_build_unrolled_join(pstmt, + join_unroller->inner_unrollers[k]); + else + ujoin->inner[i].scan = pgpa_build_scan(pstmt, ujoin->inner[i].plan, + ujoin->inner[i].elided_node, + true); + } + + return ujoin; +} + +/* + * Free memory allocated for pgpa_join_unroller. + */ +void +pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller) +{ + pfree(join_unroller->strategy); + pfree(join_unroller->inner_subplans); + pfree(join_unroller->inner_elided_nodes); + pfree(join_unroller->inner_unrollers); + pfree(join_unroller); +} + +/* + * Identify the join strategy used by a join and the "real" inner and outer + * plans. + * + * For example, a Hash Join always has a Hash node on the inner side, but + * for all intents and purposes the real inner input is the Hash node's child, + * not the Hash node itself. + * + * Likewise, a Merge Join may have Sort note on the inner or outer side; if + * it does, the real input to the join is the Sort node's child, not the + * Sort node itself. + * + * In addition, with a Merge Join or a Nested Loop, the join planning code + * may add additional nodes such as Materialize or Memoize. We regard these + * as an aspect of the join strategy. As in the previous cases, the true input + * to the join is the underlying node. + * + * However, if any involved child node previously had a now-elided node stacked + * on top, then we can't "look through" that node -- indeed, what's going to be + * relevant for our purposes is the ElidedNode on top of that plan node, rather + * than the plan node itself. + * + * If there are multiple elided nodes, we want that one that would have been + * uppermost in the plan tree prior to setrefs processing; we expect to find + * that one last in the list of elided nodes. + * + * On return *realouter and *realinner will have been set to the real inner + * and real outer plans that we identified, and *elidedrealouter and + * *elidedrealinner to the last of any correspoding elided nodes. + */ +static pgpa_join_strategy +pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan, + Plan **realouter, Plan **realinner, + ElidedNode **elidedrealouter, ElidedNode **elidedrealinner) +{ + PlannedStmt *pstmt = walker->pstmt; + JoinType jointype = ((Join *) plan)->jointype; + Plan *outerplan = plan->lefttree; + Plan *innerplan = plan->righttree; + ElidedNode *elidedouter; + ElidedNode *elidedinner; + pgpa_join_strategy strategy; + bool uniqueouter; + bool uniqueinner; + + elidedouter = pgpa_last_elided_node(pstmt, outerplan); + elidedinner = pgpa_last_elided_node(pstmt, innerplan); + + switch (nodeTag(plan)) + { + case T_MergeJoin: + + /* + * The planner may have chosen to place a Material node on the + * inner side of the MergeJoin; if this is present, we record it + * as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_MERGE_JOIN_MATERIALIZE; + } + else + strategy = JSTRAT_MERGE_JOIN_PLAIN; + + /* + * For a MergeJoin, either the outer or the inner subplan, or + * both, may have needed to be sorted; we must disregard any Sort + * or IncrementalSort node to find the real inner or outer + * subplan. + */ + if (elidedouter == NULL && is_sorting_plan(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_sorting_plan(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + break; + + case T_NestLoop: + + /* + * The planner may have chosen to place a Material or Memoize node + * on the inner side of the NestLoop; if this is present, we + * record it as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MATERIALIZE; + } + else if (elidedinner == NULL && IsA(innerplan, Memoize)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MEMOIZE; + } + else + strategy = JSTRAT_NESTED_LOOP_PLAIN; + break; + + case T_HashJoin: + + /* + * The inner subplan of a HashJoin is always a Hash node; the real + * inner subplan is the Hash node's child. + */ + Assert(IsA(innerplan, Hash)); + Assert(elidedinner == NULL); + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_HASH_JOIN; + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan)); + } + + /* + * The planner may have decided to implement a semijoin by first making + * the nullable side of the plan unique, and then performing a normal join + * against the result. Therefore, we might need to descend through a + * unique node on either side of the plan. + */ + uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter); + uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner); + + /* + * The planner may have decided to parallelize part of the join tree, so + * we could find a Gather or Gather Merge node here. Note that, if + * present, this will appear below both (1) nodes we considered as part of + * the join strategy and (2) nodes inserted for uniqueness, because we + * currently only consider gathering a partial path on top of the final + * path for some RelOptInfo. + */ + if (elidedouter == NULL) + elidedouter = pgpa_descend_any_gather(pstmt, &outerplan); + if (elidedinner == NULL) + elidedinner = pgpa_descend_any_gather(pstmt, &innerplan); + + /* + * It's possible that Result node has been inserted either to project a + * target list or to implement a one-time filter. If so, we can descend + * throught it. Note that a result node without a child would be a + * degenerate scan or join, and not something we could descend through. + * + * XXX. I suspect it's possible for this to happen above the Gather or + * Gather Merge node, too, but apparently we have no test case for that + * scenario. + */ + if (elidedouter == NULL && is_result_node_with_child(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_result_node_with_child(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + + /* + * If this is a semijoin that was converted to an inner join by making one + * side or the other unique, make a note that the inner or outer subplan, + * as appropriate, should be treated as a query plan feature when the main + * tree traversal reaches it. + * + * Conversely, if the planner could have made one side of the join unique + * and thereby converted it to an inner join, and chose not to do so, that + * is also worth noting. (XXX: I'm not sure that we need to emit + * non-unique advice in every case where these join-type tests pass.) + * + * NB: This code could appear slightly higher up in in this function, but + * none of the nodes through which we just descended should be have + * associated RTIs. + * + * NB: This seems like a somewhat hacky way of passing information up to + * the main tree walk, but I don't currently have a better idea. + */ + if (uniqueouter) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, outerplan); + else if (jointype == JOIN_RIGHT_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, outerplan); + if (uniqueinner) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, innerplan); + else if (jointype == JOIN_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, innerplan); + + /* Set output parameters. */ + *realouter = outerplan; + *realinner = innerplan; + *elidedrealouter = elidedouter; + *elidedrealinner = elidedinner; + return strategy; +} + +/* + * Descend through a Plan node in a join tree that the caller has determined + * to be irrelevant. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. + */ +static ElidedNode * +pgpa_descend_node(PlannedStmt *pstmt, Plan **plan) +{ + *plan = (*plan)->lefttree; + return pgpa_last_elided_node(pstmt, *plan); +} + +/* + * Descend through a Gather or Gather Merge node, if present, and any Sort + * or IncrementalSort node occurring under a Gather Merge. + * + * Caller should have verified that there is no ElidedNode pertaining to + * the initial value of *plan. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. + */ +static ElidedNode * +pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan) +{ + if (IsA(*plan, Gather)) + return pgpa_descend_node(pstmt, plan); + + if (IsA(*plan, GatherMerge)) + { + ElidedNode *elided = pgpa_descend_node(pstmt, plan); + + if (elided == NULL && is_sorting_plan(*plan)) + elided = pgpa_descend_node(pstmt, plan); + + return elided; + } + + return NULL; +} + +/* + * If *plan is an Agg or Unique node, we want to descend through it, unless + * it has a corresponding elided node. If its immediate child is a Sort or + * IncrementalSort, we also want to descend through that, unless it has a + * corresponding elided node. + * + * On entry, *elided_node must be the last of any elided nodes corresponding + * to *plan; on exit, this will still be true, but *plan may have been updated. + * + * The reason we don't want to descend through elided nodes is that a single + * join tree can't cross through any sort of elided node: subqueries are + * planned separately, and planning inside an Append or MergeAppend is + * separate from planning outside of it. + * + * The return value is true if we descend through at least one node, and + * otherwise false. + */ +static bool +pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node) +{ + if (*elided_node != NULL) + return false; + + if (IsA(*plan, Agg) || IsA(*plan, Unique)) + { + *elided_node = pgpa_descend_node(pstmt, plan); + + if (*elided_node == NULL && is_sorting_plan(*plan)) + *elided_node = pgpa_descend_node(pstmt, plan); + + return true; + } + + return false; +} + +/* + * Create a Bitmapset of all RTIs that we associated with an unrolled join. + * + * As usual, join RTIs are excluded. + */ +Bitmapset * +pgpa_unrolled_join_all_relids(pgpa_unrolled_join *join) +{ + Bitmapset *relids = NULL; + + relids = pgpa_add_member_relids(relids, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + relids = pgpa_add_member_relids(relids, &join->inner[k]); + + return relids; +} + +/* + * Add all RTIs that we associate with a join member to the provided set, + * returning the result. + */ +static Bitmapset * +pgpa_add_member_relids(Bitmapset *relids, pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + return bms_union(relids, + pgpa_unrolled_join_all_relids(member->unrolled_join)); + else + { + Assert(member->scan != NULL); + return bms_union(relids, member->scan->relids); + } +} + +/* + * Is this a Result node that has a child? + */ +static bool +is_result_node_with_child(Plan *plan) +{ + return IsA(plan, Result) && plan->lefttree != NULL; +} + +/* + * Is this a Plan node whose purpose is put the data in a certain order? + */ +static bool +is_sorting_plan(Plan *plan) +{ + return IsA(plan, Sort) || IsA(plan, IncrementalSort); +} diff --git a/contrib/pg_plan_advice/pgpa_join.h b/contrib/pg_plan_advice/pgpa_join.h new file mode 100644 index 0000000000..7c861fb446 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.h @@ -0,0 +1,118 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.h + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_JOIN_H +#define PGPA_JOIN_H + +#include "nodes/plannodes.h" + +struct pgpa_plan_walker_context; +struct pgpa_clumped_join; +typedef struct pgpa_join_unroller pgpa_join_unroller; +typedef struct pgpa_unrolled_join pgpa_unrolled_join; + +/* + * Although there are three main join strategies, we try to classify things + * more precisely here: merge joins have the option of using materialization + * on the inner side, and nested loops can use either materialization or + * memoization. + */ +typedef enum +{ + JSTRAT_MERGE_JOIN_PLAIN = 0, + JSTRAT_MERGE_JOIN_MATERIALIZE, + JSTRAT_NESTED_LOOP_PLAIN, + JSTRAT_NESTED_LOOP_MATERIALIZE, + JSTRAT_NESTED_LOOP_MEMOIZE, + JSTRAT_HASH_JOIN + /* update NUM_PGPA_JOIN_STRATEGY if you add anything here */ +} pgpa_join_strategy; + +#define NUM_PGPA_JOIN_STRATEGY ((int) JSTRAT_HASH_JOIN + 1) + +/* + * In an outer-deep join tree, every member of an unrolled join will be a scan, + * but join trees with other shapes can contain unrolled joins. + * + * The plan node we store here will be the inner or outer child of the join + * node, as appropriate, except that we look through subnodes that we regard as + * part of the join method itself. For instance, for a Nested Loop that + * materializes the inner input, we'll store the child of the Materialize node, + * not the Materialize node itself. + * + * If setrefs processing elided one or more nodes from the plan tree, then + * we'll store details about the topmost of those in elided_node; otherwise, + * it will be NULL. + * + * Exactly one of scan and unrolled_join will be non-NULL. + */ +typedef struct +{ + Plan *plan; + ElidedNode *elided_node; + struct pgpa_scan *scan; + pgpa_unrolled_join *unrolled_join; +} pgpa_join_member; + +/* + * Non-clumped joins are unrolled; that is, we convert outer-deep join trees + * to a flat structure. ((A JOIN B) JOIN C) JOIN D gets converted to + * outer = A, inner = <B C D>, provided that none of the joins involved are + * clumped. When joins aren't outer-deep, substructure is required, e.g. + * (A JOIN B) JOIN (C JOIN D) is represented as outer = A, inner = <B X>, + * where X is a clumped or unrolled join covering C-D. + */ +struct pgpa_unrolled_join +{ + /* Outermost member; must not itself be an unrolled join. */ + pgpa_join_member outer; + + /* Number of inner members. Length of the strategy and inner arrays. */ + unsigned ninner; + + /* Array of strategies, one per non-outermost member. */ + pgpa_join_strategy *strategy; + + /* Array of members, excluding the outermost. Deepest first. */ + pgpa_join_member *inner; +}; + +/* + * Classification of Plan nodes based on what structure we should construct. + */ +typedef enum +{ + PGPA_NON_JOIN, + PGPA_CLUMPED_JOIN, + PGPA_UNROLLED_JOIN +} pgpa_join_class; + +/* + * Does this plan node inherit from Join? + */ +static inline bool +pgpa_is_join(Plan *plan) +{ + return IsA(plan, NestLoop) || IsA(plan, MergeJoin) || IsA(plan, HashJoin); +} + +extern pgpa_join_unroller *pgpa_create_join_unroller(void); +extern void pgpa_unroll_join(struct pgpa_plan_walker_context *walker, + Plan *plan, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller); +extern pgpa_unrolled_join *pgpa_build_unrolled_join(PlannedStmt *pstmt, + pgpa_join_unroller *join_unroller); +extern void pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller); +extern Bitmapset *pgpa_unrolled_join_all_relids(pgpa_unrolled_join *join); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_output.c b/contrib/pg_plan_advice/pgpa_output.c new file mode 100644 index 0000000000..1b660791ba --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.c @@ -0,0 +1,480 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.c + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_output.h" +#include "pgpa_scan.h" + +typedef struct pgpa_output_context +{ + const char **rt_identifiers; + StringInfo buf; + List *unrolled_joins[NUM_PGPA_JOIN_STRATEGY]; + List *scans[NUM_PGPA_SCAN_STRATEGY]; + List *query_features[NUM_PGPA_QF_TYPES]; + int wrap_column; +} pgpa_output_context; + +static void pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join); +static void pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member); +static void pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids); + +static char *pgpa_cstring_join_strategy(pgpa_join_strategy strategy); +static char *pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy); + +static void pgpa_maybe_linebreak(StringInfo buf, int wrap_column); + +/* + * Append query advice to the provided buffer. + * + * Before calling this function, 'walker' must be used to iterate over the + * main plan tree and all subplans from the PlannedStmt. + * + * 'rt_identifiers' is a table of unique identifiers, one for each RTI. + * See pgpa_create_identifiers_for_planned_stmt(). + * + * Results will be appended to 'buf'. + */ +void +pgpa_output_advice(StringInfo buf, pgpa_plan_walker_context *walker, + const char **rt_identifiers) +{ + ListCell *lc; + pgpa_output_context context; + + /* + * If the user chooses to use EXPLAIN (PLAN_ADVICE) in an 80-column window + * from a psql client with default settings, psql will add one space to the + * left of the output and EXPLAIN will add two more to the left of the + * advice. Thus, lines of more than 77 characters will wrap. We set the + * wrap limit to 76 here so that the output won't reach all the way to the + * very last column of the terminal. + * + * Of course, this is fairly arbitrary set of assumptions, and one could + * well make an argument for a different wrap limit, or for a configurable + * one. + */ + memset(&context, 0, sizeof(pgpa_output_context)); + context.rt_identifiers = rt_identifiers; + context.buf = buf; + context.wrap_column = 76; + + /* + * Put all the top-level scans for each strategy into a single list. + */ + foreach(lc, walker->scans) + { + pgpa_scan *scan = lfirst(lc); + + context.scans[scan->strategy] = + lappend(context.scans[scan->strategy], scan->relids); + } + + /* + * Each piece of JOIN_ORDER() advice fully describes the join order for a + * a single unrolled join. Merging is not permitted, because that would + * change the meaning, e.g. SEQ_SCAN(a b c d) means simply that sequential + * scans should be used for all of those relations, and is thus equivalent + * to SEQ_SCAN(a b) SEQ_SCAN(c d), but JOIN_ORDER(a b c d) means that "a" + * is the driving table which is then joined to "b" then "c" then "d", + * which is totally different from JOIN_ORDER(a b) and JOIN_ORDER(c d). + * + * As a side effect, the calls to pgpa_output_unrolled_join() will add to + * the context.unrolled_joins[] lists and context.scans[] lists, which + * will be important for emitting join and scan strategy advice further + * down. + */ + foreach(lc, walker->unrolled_joins) + { + pgpa_unrolled_join *ujoin = lfirst(lc); + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(context.buf, "JOIN_ORDER("); + pgpa_output_unrolled_join(&context, ujoin); + appendStringInfoChar(context.buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* + * Emit just one piece of advice for each unrolled-join strategy that is + * used at least once in the query. + */ + for (int s = 0; s < NUM_PGPA_JOIN_STRATEGY; ++s) + { + char *strategy = pgpa_cstring_join_strategy(s); + bool first = true; + + if (context.unrolled_joins[s] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(buf, "%s(", strategy); + + foreach(lc, context.unrolled_joins[s]) + { + Bitmapset *relids = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* + * Emit just one piece of advice for each scan strategy that is used at + * least once in the query. + * + * XXX. We shouldn't emit anything for ordinary scans, and we should + * emit more detailed information for index, index-only, and + * bitmap-heap scans. + */ + for (int c = 0; c < NUM_PGPA_SCAN_STRATEGY; ++c) + { + char *cstrategy = pgpa_cstring_scan_strategy(c); + bool first = true; + + if (context.scans[c] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(buf, "%s(", cstrategy); + + foreach(lc, context.scans[c]) + { + Bitmapset *relids = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* Sort query features into one list per query feature type. */ + foreach(lc, walker->query_features) + { + pgpa_query_feature *qf = lfirst(lc); + + context.query_features[qf->type] = + lappend(context.query_features[qf->type], qf); + } + + /* + * Emit just one piece of advice for each type of query feature that is + * used at least once in the query. + */ + for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t) + { + bool first = true; + + if (context.query_features[t] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + + switch (t) + { + case PGPAQF_GATHER: + appendStringInfo(buf, "GATHER("); + break; + case PGPAQF_GATHER_MERGE: + appendStringInfo(buf, "GATHER_MERGE("); + break; + case PGPAQF_SEMIJOIN_NON_UNIQUE: + appendStringInfo(buf, "SEMIJOIN_NON_UNIQUE("); + break; + case PGPAQF_SEMIJOIN_UNIQUE: + appendStringInfo(buf, "SEMIJOIN_UNIQUE("); + break; + } + + foreach(lc, context.query_features[t]) + { + pgpa_query_feature *qf = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(qf->relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, qf->relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, qf->relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } +} + +/* + * Output the members of an unrolled join, first the outermost member, and + * then the inner members one by one, as part of JOIN_ORDER() advice. + * + * As we visit members, we add to the context->unrolled_joins[] and + * context->scans[] lists. The former updates are done by this function + * directly, and the latter via pgpa_output_join_member. + */ +static void +pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join) +{ + pgpa_output_join_member(context, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + { + pgpa_join_member *member = &join->inner[k]; + Bitmapset *relids; + + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_join_member(context, member); + + if (member->unrolled_join != NULL) + relids = pgpa_unrolled_join_all_relids(member->unrolled_join); + else + { + Assert(member->scan != NULL); + relids = member->scan->relids; + } + context->unrolled_joins[join->strategy[k]] = + lappend(context->unrolled_joins[join->strategy[k]], relids); + } +} + +/* + * Output a single member of an unrolled join as part of JOIN_ORDER() advice. + * + * If that member happens to be a scan, also add it to the appropriate + * context->scans[] list. + */ +static void +pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + { + appendStringInfoChar(context->buf, '('); + pgpa_output_unrolled_join(context, member->unrolled_join); + appendStringInfoChar(context->buf, ')'); + } + else + { + pgpa_scan *scan = member->scan; + + Assert(scan != NULL); + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '{'); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, '}'); + } + context->scans[scan->strategy] = + lappend(context->scans[scan->strategy], scan->relids); + } +} + +/* + * Output the identifiers for each RTI in the provided set. + * + * Identifiers are separated by spaces, and a line break is possible after + * each one. + */ +static void +pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids) +{ + int rti = -1; + bool first = true; + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + const char *identifier = context->rt_identifiers[rti - 1]; + + if (identifier == NULL) + elog(ERROR, "no identifier for RTI %d", rti); + + if (first) + { + first = false; + appendStringInfoString(buf, identifier); + } + else + { + pgpa_maybe_linebreak(buf, context->wrap_column); + appendStringInfo(buf, " %s", identifier); + } + } +} + +/* + * Get a C string that corresponds to the specified join strategy. + */ +static char * +pgpa_cstring_join_strategy(pgpa_join_strategy strategy) +{ + switch (strategy) + { + case JSTRAT_MERGE_JOIN_PLAIN: + return "MERGE_JOIN_PLAIN"; + case JSTRAT_MERGE_JOIN_MATERIALIZE: + return "MERGE_JOIN_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_PLAIN: + return "NESTED_LOOP_PLAIN"; + case JSTRAT_NESTED_LOOP_MATERIALIZE: + return "NESTED_LOOP_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_MEMOIZE: + return "NESTED_LOOP_MEMOIZE"; + case JSTRAT_HASH_JOIN: + return "HASH_JOIN"; + } + + Assert(false); +} + +/* + * Get a C string that corresponds to the specified scan strategy. + */ +static char * +pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy) +{ + switch (strategy) + { + case PGPA_SCAN_ORDINARY: + return "ORDINARY_SCAN"; + case PGPA_SCAN_SEQ: + return "SEQ_SCAN"; + case PGPA_SCAN_BITMAP_HEAP: + return "BITMAP_HEAP_SCAN"; + case PGPA_SCAN_FOREIGN: + return "FOREIGN_SCAN"; + case PGPA_SCAN_CUSTOM: + return "CUSTOM_SCAN"; + case PGPA_SCAN_INDEX: + return "INDEX_SCAN"; + case PGPA_SCAN_INDEX_ONLY: + return "INDEX_ONLY_SCAN"; + case PGPA_SCAN_PARTITIONWISE: + return "PARTITIONWISE"; + } + + Assert(false); +} + +/* + * Insert a line break into the StringInfoData, if needed. + * + * If wrap_column is zero or negative, this does nothing. Otherwise, we + * consider inserting a newline. We only insert a newline if the length of + * the last line in the buffer exceeds wrap_column, and not if we'd be + * inserting a newline at or before the beginning of the current line. + * + * The position at which the newline is inserted is simply wherever the + * buffer ended the last time this function was called. In other words, + * the caller is expected to call this function every time we reach a good + * place for a line break. + */ +static void +pgpa_maybe_linebreak(StringInfo buf, int wrap_column) +{ + char *trailing_nl; + int line_start; + int save_cursor; + + /* If line wrapping is disabled, exit quickly. */ + if (wrap_column <= 0) + return; + + /* + * Set line_start to the byte offset within buf->data of the first + * character of the current line, where the current line means the last + * one in the buffer. Note that line_start could be the offset of the + * trailing '\0' if the last character in the buffer is a line break. + */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + line_start = 0; + else + line_start = (trailing_nl - buf->data) + 1; + + /* + * Remember that the current end of the buffer is a potential location to + * insert a line break on a future call to this function. + */ + save_cursor = buf->cursor; + buf->cursor = buf->len; + + /* If we haven't passed the wrap column, we don't need a newline. */ + if (buf->len - line_start <= wrap_column) + return; + + /* + * It only makes sense to insert a newline at a position later than the + * beginning of the current line. + */ + if (buf->cursor <= line_start) + return; + + /* Insert a newline at the previous cursor location. */ + enlargeStringInfo(buf, 1); + memmove(&buf->data[save_cursor] + 1, &buf->data[save_cursor], + buf->len - save_cursor); + ++buf->cursor; + buf->data[++buf->len] = '\0'; + buf->data[save_cursor] = '\n'; +} diff --git a/contrib/pg_plan_advice/pgpa_output.h b/contrib/pg_plan_advice/pgpa_output.h new file mode 100644 index 0000000000..f7d74d3794 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.h + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_OUTPUT_H +#define PGPA_OUTPUT_H + +#include "pgpa_walker.h" + +extern void pgpa_output_advice(StringInfo buf, + pgpa_plan_walker_context *walker, + const char **rt_identifiers); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_scan.c b/contrib/pg_plan_advice/pgpa_scan.c new file mode 100644 index 0000000000..5d0c6f5043 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.c + * analysis of scans in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/parsenodes.h" +#include "parser/parsetree.h" + +/* + * Build a pgpa_scan object for a Plan node. + * + * If there is at least one ElidedNode for this plan node, pass the uppermost + * one as elided_node, else pass NULL. + * + * Set the 'force' flag if we're inside of a join problem and not otherwise; + * see comments below for why this matters. + * + * The return value can be NULL only if force is false. + */ +pgpa_scan * +pgpa_build_scan(PlannedStmt *pstmt, Plan *plan, ElidedNode *elided_node, + bool force) +{ + pgpa_scan *scan; + pgpa_scan_strategy strategy = PGPA_SCAN_ORDINARY; + Bitmapset *relids = NULL; + int rti = -1; + bool filter_join_rtis = false; + + if (elided_node != NULL) + { + NodeTag elided_type = elided_node->elided_type; + + /* + * If setrefs processing elided an Append or MergeAppend node that + * had only one surviving child, then this is a partitionwise + * "scan" -- which may really be a partitionwise join, but there's + * no need to distinguish. + * + * If it's a trivial SubqueryScan that was elided, then this is an + * "ordinary" scan i.e. one for which we need to generate advice + * because the planner has not made any meaningful choice. + */ + relids = elided_node->relids; + if (elided_type == T_Append || elided_type == T_MergeAppend) + strategy = PGPA_SCAN_PARTITIONWISE; + else + strategy = PGPA_SCAN_ORDINARY; + filter_join_rtis = true; + } + else if ((rti = pgpa_scanrelid(plan)) != 0) + { + relids = bms_make_singleton(rti); + + switch (nodeTag(plan)) + { + case T_SeqScan: + strategy = PGPA_SCAN_SEQ; + break; + case T_BitmapHeapScan: + strategy = PGPA_SCAN_BITMAP_HEAP; + break; + case T_ForeignScan: + strategy = PGPA_SCAN_FOREIGN; + break; + case T_CustomScan: + strategy = PGPA_SCAN_CUSTOM; + break; + case T_IndexScan: + strategy = PGPA_SCAN_INDEX; + break; + case T_IndexOnlyScan: + strategy = PGPA_SCAN_INDEX_ONLY; + break; + default: + strategy = PGPA_SCAN_ORDINARY; + break; + } + } + else if ((relids = pgpa_relids(plan)) != NULL) + { + switch (nodeTag(plan)) + { + case T_ForeignScan: + strategy = PGPA_SCAN_FOREIGN; + break; + case T_Append: + case T_MergeAppend: + { + int first_rti; + RangeTblEntry *first_rte; + + first_rti = bms_next_member(relids, -1); + first_rte = rt_fetch(first_rti, pstmt->rtable); + + /* + * Append and MergeAppend nodes can be the result either + * of set operations or of inheritance expansion. In the + * latter case, this plan node is telling us that the relid + * set found in the plan node was handled partitionwise, + * but in the former, it isn't. Even if the set operations + * planner has multiple ways to execute some set operation, + * we lack the ability to describe what it decided here, + * and hence just label this as an ordinary scan. + * + * XXX. This breaks with a single Append node that involves + * both set operations and inheritance expansion. Consider: + * + * create table foo (a int) partition by range (a); + * create table foo0 partition of foo for values + * from (0) to (1000); + * create table foo1 partition of foo for values + * from (1000) to (2000); + * explain select * from foo union all select 1; + */ + if (first_rte->rtekind == RTE_RELATION) + strategy = PGPA_SCAN_PARTITIONWISE; + else + strategy = PGPA_SCAN_ORDINARY; + } + break; + default: + strategy = PGPA_SCAN_ORDINARY; + break; + } + filter_join_rtis = true; + } + + /* + * If this plan node has no associated RTIs, it's not a scan. When the + * 'force' flag is set, that's unexpected, so throw an error, else return + * quietly. + */ + if (relids == NULL) + { + if (force) + elog(ERROR, "plan node has no RTIs: %d", (int) nodeTag(plan)); + return NULL; + } + + /* + * Ordinary scans appearing outside of a join problem need not be tracked, + * since nothing interesting can be said either about the choice of join + * method or about the join order. + */ + if (!force && strategy == PGPA_SCAN_ORDINARY) + return NULL; + + /* Now create the result object. */ + scan = palloc(sizeof(pgpa_scan)); + scan->plan = plan; + scan->strategy = strategy; + + /* + * In some cases, it's possible for RTIs of type RTE_JOIN to be included, + * and when that is the case, we want to filter them out. We always refer + * to groups of relations in terms of the relation RTIs, not the join RTIs, + * so the join RTIs are just noise. + * + * When the code earlier in this function knows that no join RTIs can have + * infiltrated the relids set, it can set filter_join_rtis = false to save + * a few CPU cycles here. + * + * One important case where join RTIs can definitely occur is when we + * encounter a Result node where a whole join result has been proven empty. + */ + if (!filter_join_rtis) + scan->relids = relids; + else + { + scan->relids = NULL; + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, pstmt->rtable); + + if (rte->rtekind != RTE_JOIN) + scan->relids = bms_add_member(scan->relids, rti); + } + } + + return scan; +} diff --git a/contrib/pg_plan_advice/pgpa_scan.h b/contrib/pg_plan_advice/pgpa_scan.h new file mode 100644 index 0000000000..c2a7c645b7 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.h + * analysis of scans in Plan trees + * + * Note that our definition of "scan" is extremely broad. It includes + * (1) single plan nodes that scan multiple RTIs, such as a degenerate + * Result node that replaces what would otherwise have been a join, and + * (2) Append and MergeAppend nodes implementing a partitionwise scan + * or a partitionwise join. + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_SCAN_H +#define PGPA_SCAN_H + +#include "nodes/plannodes.h" + +/* + * Scan strategies. + * + * PGPA_SCAN_ORDINARY is any scan strategy that isn't interesting to us + * because there is no meaningful planner decision involved. For example, + * the only way to scan a subquery is a SubqueryScan, and the only way to + * scan a VALUES construct is a ValuesScan. We need not care exactly which + * type of planner node was used in such cases, because the same thing will + * happen when replanning. + * + * PGPA_SCAN_ORDINARY also includes Result nodes that correspond to scans + * or even joins that are proved empty. We don't know whether or not the scan + * or join will still be provably empty at replanning time, but if it is, + * then no scan-type advice is needed, and if it's not, we can't recommend + * a scan type based on the current plan. + * + * PGPA_SCAN_PARTITIONWISE also lumps together scans and joins: this can + * be either a partitionwise scan of a partitioned table or a partitionwise + * join between several partitioned tables. Note that all decisions about + * whether or not to use partitionwise join are meaningful: no matter what + * we decided this time, we could do more or fewer things partitionwise the + * next time. + * + * Other scan strategies map one-to-one to plan nodes. + */ +typedef enum +{ + PGPA_SCAN_ORDINARY = 0, + PGPA_SCAN_SEQ, + PGPA_SCAN_BITMAP_HEAP, + PGPA_SCAN_FOREIGN, + PGPA_SCAN_CUSTOM, + PGPA_SCAN_INDEX, + PGPA_SCAN_INDEX_ONLY, + PGPA_SCAN_PARTITIONWISE + /* update NUM_PGPA_SCAN_STRATEGY if you add anything here */ +} pgpa_scan_strategy; + +#define NUM_PGPA_SCAN_STRATEGY ((int) PGPA_SCAN_PARTITIONWISE + 1) + +/* + * All of the details we need regarding a scan. + */ +typedef struct pgpa_scan +{ + Plan *plan; + pgpa_scan_strategy strategy; + Bitmapset *relids; +} pgpa_scan; + +extern pgpa_scan *pgpa_build_scan(PlannedStmt *pstmt, Plan *plan, + ElidedNode *elided_node, bool force); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c new file mode 100644 index 0000000000..9eb739220a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.c @@ -0,0 +1,408 @@ +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/plannodes.h" + +static pgpa_query_feature *pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +static void pgpa_qf_add_rti(List *active_query_features, Index rti); +static void pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids); +static void pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan); + +/* + * Iterate over the entire plan tree. + * + * XXX. More comments. + */ +void +pgpa_plan_walker(pgpa_plan_walker_context *walker, Plan *plan, + int join_unroll_level, + pgpa_join_unroller *join_unroller, + List *active_query_features) +{ + pgpa_join_unroller *outer_join_unroller = NULL; + pgpa_join_unroller *inner_join_unroller = NULL; + bool join_unroller_toplevel = false; + bool pushdown_query_features = false; + ListCell *lc; + List *extraplans = NIL; + List *elided_nodes = NIL; + bool is_query_feature = false; + + /* + * If this is a Gather or Gather Merge node, directly add it to the list + * of currently-active query features. + * + * Otherwise, check the future_query_features list to see whether this was + * previously identified as a plan node that needs to be treated as a + * query feature. + */ + if (IsA(plan, Gather)) + { + active_query_features = + lappend(active_query_features, + pgpa_add_feature(walker, PGPAQF_GATHER, plan)); + is_query_feature = true; + } + else if (IsA(plan, GatherMerge)) + { + active_query_features = + lappend(active_query_features, + pgpa_add_feature(walker, PGPAQF_GATHER_MERGE, plan)); + is_query_feature = true; + } + else + { + foreach_ptr(pgpa_query_feature, qf, walker->future_query_features) + { + if (qf->plan == plan) + { + is_query_feature = true; + active_query_features = lappend(active_query_features, qf); + walker->future_query_features = + list_delete_ptr(walker->future_query_features, plan); + break; + } + } + } + + /* + * Find all elided nodes for this Plan node. + */ + foreach_node(ElidedNode, n, walker->pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_nodes = lappend(elided_nodes, n); + } + + /* If we found any elided_nodes, handle them. */ + if (elided_nodes != NIL) + { + int num_elided_nodes = list_length(elided_nodes); + ElidedNode *last_elided_node; + + /* XXX */ + last_elided_node = list_nth(elided_nodes, num_elided_nodes - 1); + pgpa_qf_add_rtis(active_query_features, last_elided_node->relids); + active_query_features = NIL; + + /* + * If there are multiple relids for the elided node, a clumped join + * should be built for it exactly once. When there's a join_unroller, + * the join unrolling process will have already built any necessary + * clumped join for the final elided node, so throw it out. + */ + if (join_unroller != NULL) + elided_nodes = list_truncate(elided_nodes, num_elided_nodes - 1); + + /* + * Every element of elided_nodes is an ElidedNode for which any + * necessary pgpa_scan has not yet been created. Do that here, and + * attach the resulting objects directly to the walker object, since + * we have nowhere else to put a reference to it. + * + * XXX. Isn't this a bald-faced lie? + */ + foreach_node(ElidedNode, n, elided_nodes) + { + pgpa_scan *scan; + + scan = pgpa_build_scan(walker->pstmt, plan, n, false); + if (scan != NULL) + walker->scans = lappend(walker->scans, scan); + } + + /* + * Because we only want to unroll joins that the planner would have + * considered as part of the same join problem, nodes elided during + * setrefs processing act as boundaries. + * + * In more detail, if an Append or MergeAppend was elided, then j a + * partitionwise join was chosen and only a single child survived; if + * a SubqueryScan was elided, the subquery was separately without + * flattening it into the parent. In either case, the join order and + * join methods beneath the elided node should be described separately + * from the join order and methods above the elided node. + */ + join_unroller = NULL; + } + + /* + * If this join needs to unrolled but there's no join unroller already + * available, create one. + */ + if (join_unroller == NULL && pgpa_is_join(plan)) + { + join_unroller = pgpa_create_join_unroller(); + join_unroller_toplevel = true; + ++join_unroll_level; + } + + /* + * We only need to handle scans here if we're not underneath a join + * unroller. When we are, the join unroller will create the pgpa_scan and + * and attach it to the unrolled join. Otherwise, do it here. + */ + if (join_unroll_level == 0) + { + pgpa_scan *scan; + + scan = pgpa_build_scan(walker->pstmt, plan, NULL, false); + if (scan != NULL) + walker->scans = lappend(walker->scans, scan); + } + + /* + * If this join is to be unrolled, pgpa_unroll_join() will return the join + * unroller object that should be passed down when we recurse into the + * outer and inner sides of the plan. + */ + if (join_unroller != NULL) + pgpa_unroll_join(walker, plan, join_unroller, + &outer_join_unroller, &inner_join_unroller); + + /* Add RTIs from the plan node to all active query features. */ + pgpa_qf_add_plan_rtis(active_query_features, plan); + + /* Recurse into the outer and inner subtrees. */ + if (plan->lefttree != NULL) + pgpa_plan_walker(walker, plan->lefttree, join_unroll_level, + outer_join_unroller, active_query_features); + if (plan->righttree != NULL) + pgpa_plan_walker(walker, plan->righttree, join_unroll_level, + inner_join_unroller, active_query_features); + + /* + * If we created a join unroller up above, then it's also our join to use + * it to build the final pgpa_unrolled_join, and to destroy the object. + */ + if (join_unroller_toplevel) + { + pgpa_unrolled_join *ujoin; + + ujoin = pgpa_build_unrolled_join(walker->pstmt, join_unroller); + walker->unrolled_joins = lappend(walker->unrolled_joins, ujoin); + pgpa_destroy_join_unroller(join_unroller); + } + + /* + * Some plan types can have additional children. Nodes like Append that + * can have any number of children store them in a List; a SubqueryScan + * just has a field for a single additional Plan. + */ + switch (nodeTag(plan)) + { + case T_Append: + { + Append *aplan = (Append *) plan; + + extraplans = aplan->appendplans; + pushdown_query_features = bms_is_empty(aplan->apprelids); + } + break; + case T_MergeAppend: + { + MergeAppend *maplan = (MergeAppend *) plan; + + extraplans = maplan->mergeplans; + pushdown_query_features = bms_is_empty(maplan->apprelids); + } + break; + case T_BitmapAnd: + extraplans = ((BitmapAnd *) plan)->bitmapplans; + break; + case T_BitmapOr: + extraplans = ((BitmapOr *) plan)->bitmapplans; + break; + case T_SubqueryScan: + + /* + * We don't pass down active_query_features across here, because + * those are specific to a subquery level. + */ + pgpa_plan_walker(walker, ((SubqueryScan *) plan)->subplan, + 0, NULL, NIL); + break; + case T_CustomScan: + extraplans = ((CustomScan *) plan)->custom_plans; + break; + default: + break; + } + + /* If we found a list of extra children, iterate over it. */ + foreach(lc, extraplans) + { + Plan *subplan = lfirst(lc); + + pgpa_plan_walker(walker, subplan, 0, NULL, + pushdown_query_features ? active_query_features : NIL); + } + + /* + * If the current node is a query feature, then active_query_features has + * been destructively modified as compared to the value passed down from + * the caller, and we need to put things back as they were. + * + * Exceptions: If the caller passed NIL, or if we reset the list to NIL + * because of the presence of an elided SubqueryScan, then we created a + * new list here instead of destructively modifying the caller's list. + * That case requires no special handling, because list_truncate() will + * simply exit quickly. + */ + if (is_query_feature) + { + int num_aqf = list_length(active_query_features); + + (void) list_truncate(active_query_features, num_aqf - 1); + } +} + +/* + * Arrange for the given plan node to be treated as a query feature when the + * tree walk reaches it. + * + * Make sure to only use this for nodes that the tree walk can't have reached + * yet! + */ +void +pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = pgpa_add_feature(walker, type, plan); + + walker->future_query_features = + lappend(walker->future_query_features, qf); +} + +/* + * Return the last of any elided nodes associated with this plan node ID. + * + * The last elided node is the one that would have been uppermost in the plan + * tree had it not been removed during setrefs processig. + */ +ElidedNode * +pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan) +{ + ElidedNode *elided_node = NULL; + + foreach_node(ElidedNode, n, pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_node = n; + } + + return elided_node; +} + +/* + * Certain plan nodes can refer to a set of RTIs. Extract and return the set. + */ +Bitmapset * +pgpa_relids(Plan *plan) +{ + if (IsA(plan, Result)) + return ((Result *) plan)->relids; + else if (IsA(plan, ForeignScan)) + return ((ForeignScan *) plan)->fs_relids; + else if (IsA(plan, Append)) + return ((Append *) plan)->apprelids; + else if (IsA(plan, MergeAppend)) + return ((MergeAppend *) plan)->apprelids; + + return NULL; +} + +/* + * Extract the scanned RTI from a plan node. + * + * Returns 0 if there isn't one. + */ +Index +pgpa_scanrelid(Plan *plan) +{ + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_ForeignScan: + case T_CustomScan: + case T_IndexScan: + case T_IndexOnlyScan: + return ((Scan *) plan)->scanrelid; + default: + return 0; + } +} + +/* + * Create a pgpa_query_feature and add it to the list of all query features + * for this plan. + */ +static pgpa_query_feature * +pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = palloc0_object(pgpa_query_feature); + + qf->type = type; + qf->plan = plan; + + walker->query_features = lappend(walker->query_features, qf); + + return qf; +} + +/* + * Add a single RTI to each active query feature. + */ +static void +pgpa_qf_add_rti(List *active_query_features, Index rti) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_member(qf->relids, rti); + } +} + +/* + * Add a set of RTIs to each active query feature. + */ +static void +pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_members(qf->relids, relids); + } +} + +/* + * Add RTIs directly contained in a plan node to each active query feature. + */ +static void +pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan) +{ + Bitmapset *relids; + Index rti; + + if ((relids = pgpa_relids(plan)) != NULL) + pgpa_qf_add_rtis(active_query_features, relids); + else if ((rti = pgpa_scanrelid(plan)) != 0) + pgpa_qf_add_rti(active_query_features, rti); +} diff --git a/contrib/pg_plan_advice/pgpa_walker.h b/contrib/pg_plan_advice/pgpa_walker.h new file mode 100644 index 0000000000..871d31801d --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * pgpa_walker.h + * Plan tree iteration + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_walker.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_WALKER_H +#define PGPA_WALKER_H + +#include "pgpa_join.h" + +/* + * We use the term "query feature" to refer to plan nodes that are interesting + * in the following way: to generate advice, we'll need to know the set of + * same-subquery, non-join RTIs occuring at or below that plan node, without + * admixture of parent and child RTIs. + * + * For example, Gather nodes, desiginated by PGPAQF_GATHER, and Gather Merge + * nodes, designated by PGPAQF_GATHER_MERGE, are query features, because we'll + * want to admit some kind of advice that describes the portion of the plan + * tree that appears beneath those nodes. + * + * Each semijoin can be implemented either by directly performing a semijoin, + * or by making one side unique and then performing a normal join. Either way, + * we use a query feature to notice what decision was made, so that we can + * describe it by enumerating the RTIs on that side of the join. + * + * To elaborate on the "no admixture of parent and child RTIs" rule, in all of + * these cases, if the entirety of an inheritance hierarchy appears beneath + * the query feature, we only want to name the parent table. But it's also + * possible to have cases where we must name child tables. This is particularly + * likely to happen when partitionwise join is in use, but could happen for + * Gather or Gather Merge even without that, if one of those appears below + * an Append or MergeAppend node for a single table. + */ +typedef enum pgpa_qf_type +{ + PGPAQF_GATHER, + PGPAQF_GATHER_MERGE, + PGPAQF_SEMIJOIN_NON_UNIQUE, + /* update NUM_PGPA_QF_TYPES if you add anything here */ + PGPAQF_SEMIJOIN_UNIQUE +} pgpa_qf_type; + +#define NUM_PGPA_QF_TYPES ((int) PGPAQF_SEMIJOIN_UNIQUE + 1) + +/* + * For each query feature, we keep track of the feature type and the set of + * relids that we found underneath the relevant plan node. See the comments + * on pgpa_qf_type, above, for additional details. + */ +typedef struct pgpa_query_feature +{ + pgpa_qf_type type; + Plan *plan; + Bitmapset *relids; +} pgpa_query_feature; + +typedef struct pgpa_plan_walker_context +{ + PlannedStmt *pstmt; + List *unrolled_joins; + List *scans; + List *query_features; + List *future_query_features; +} pgpa_plan_walker_context; + +extern void pgpa_plan_walker(pgpa_plan_walker_context *walker, Plan *plan, + int join_unroll_level, + pgpa_join_unroller *join_unroller, + List *active_query_features); + +extern void pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +extern ElidedNode *pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan); +extern Bitmapset *pgpa_relids(Plan *plan); +extern Index pgpa_scanrelid(Plan *plan); + +#endif diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index df850cf634..ad8416a4d8 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4345,3 +4345,20 @@ z_streamp zic_t SubPlanRTInfo ElidedNode +pgpa_collected_advice +pgpa_join_class +pgpa_join_member +pgpa_join_strategy +pgpa_join_unroller +pgpa_local_advice +pgpa_local_advice_chunk +pgpa_output_context +pgpa_plan_walker_context +pgpa_qf_type +pgpa_query_feature +pgpa_scan +pgpa_scan_strategy +pgpa_shared_advice +pgpa_shared_advice_chunk +pgpa_shared_state +pgpa_unrolled_join |