diff options
21 files changed, 3536 insertions, 0 deletions
diff --git a/contrib/Makefile b/contrib/Makefile index 2f0a88d3f7..dd04c20acd 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -34,6 +34,7 @@ SUBDIRS = \ pg_freespacemap \ pg_logicalinspect \ pg_overexplain \ + pg_plan_advice \ pg_prewarm \ pg_stat_statements \ pg_surgery \ diff --git a/contrib/meson.build b/contrib/meson.build index ed30ee7d63..cb718dbdac 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -48,6 +48,7 @@ subdir('pgcrypto') subdir('pg_freespacemap') subdir('pg_logicalinspect') subdir('pg_overexplain') +subdir('pg_plan_advice') subdir('pg_prewarm') subdir('pgrowlocks') subdir('pg_stat_statements') diff --git a/contrib/pg_plan_advice/Makefile b/contrib/pg_plan_advice/Makefile new file mode 100644 index 0000000000..6f8127e551 --- /dev/null +++ b/contrib/pg_plan_advice/Makefile @@ -0,0 +1,30 @@ +# contrib/pg_plan_advice/Makefile + +MODULE_big = pg_plan_advice +OBJS = \ + $(WIN32RES) \ + pg_plan_advice.o \ + pgpa_collector.o \ + pgpa_identifier.o \ + pgpa_join.o \ + pgpa_output.o \ + pgpa_scan.o \ + pgpa_walker.o + +EXTENSION = pg_plan_advice +DATA = pg_plan_advice--1.0.sql +PGFILEDESC = "pg_plan_advice - help the planner get the right plan" + +REGRESS = pg_plan_advice +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_plan_advice +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_plan_advice/meson.build b/contrib/pg_plan_advice/meson.build new file mode 100644 index 0000000000..d9e3ec148c --- /dev/null +++ b/contrib/pg_plan_advice/meson.build @@ -0,0 +1,35 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +pg_plan_advice_sources = files( + 'pg_plan_advice.c', + 'pgpa_collector.c', + 'pgpa_identifier.c', + 'pgpa_join.c', + 'pgpa_output.c', + 'pgpa_scan.c', + 'pgpa_walker.c', +) + +if host_system == 'windows' + pg_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_plan_advice', + '--FILEDESC', 'pg_plan_advice - help the planner get the right plan',]) +endif + +pg_plan_advice = shared_module('pg_plan_advice', + pg_plan_advice_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_plan_advice + +install_data( + 'pg_plan_advice--1.0.sql', + 'pg_plan_advice.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_plan_advice', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), +} diff --git a/contrib/pg_plan_advice/pg_plan_advice--1.0.sql b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql new file mode 100644 index 0000000000..729950bcce --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql @@ -0,0 +1,40 @@ +/* contrib/pg_plan_advice/pg_plan_advice--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_plan_advice" to load this file. \quit + +CREATE FUNCTION pg_clear_collected_local_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_clear_collected_shared_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_shared_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_local_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_shared_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_shared_advice' +LANGUAGE C STRICT; diff --git a/contrib/pg_plan_advice/pg_plan_advice.c b/contrib/pg_plan_advice/pg_plan_advice.c new file mode 100644 index 0000000000..0c54cd8f54 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.c @@ -0,0 +1,342 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.c + * Main entrypoints for generating and applying planner advice + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" +#include "pgpa_identifier.h" +#include "pgpa_output.h" +#include "pgpa_walker.h" + +#include "commands/defrem.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "commands/explain_state.h" +#include "funcapi.h" +#include "storage/dsm_registry.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static pgpa_shared_state *pgpa_state = NULL; +static dsa_area *pgpa_dsa_area = NULL; + +/* GUC variables */ +int pg_plan_advice_local_collection_limit = 0; +int pg_plan_advice_shared_collection_limit = 0; + +/* Saved hook values */ +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static explain_per_plan_hook_type prev_explain_per_plan_hook = NULL; + +/* Other file-level globals */ +static int es_extension_id; +static MemoryContext pgpa_memory_context = NULL; + +static void pg_plan_advice_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_plan_advice_explain_option_handler(ExplainState *es, + DefElem *opt, + ParseState *pstate); +static void pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv); +static char *pg_plan_advice_generate(PlannedStmt *pstmt); + +/* + * Initialize this module. + */ +void +_PG_init(void) +{ + DefineCustomIntVariable("pg_plan_advice.local_collection_limit", + "# of advice entries to retain in per-backend memory", + NULL, + &pg_plan_advice_local_collection_limit, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.shared_collection_limit", + "# of advice entries to retain in shared memory", + NULL, + &pg_plan_advice_shared_collection_limit, + 0, + 0, INT_MAX, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + /* Get an ID that we can use to cache data in an ExplainState. */ + es_extension_id = GetExplainExtensionId("pg_plan_advice"); + + /* Register the new EXPLAIN options implemented by this module. */ + RegisterExtensionExplainOption("plan_advice", + pg_plan_advice_explain_option_handler); + + /* Install hooks */ + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_plan_advice_ExecutorStart; + prev_explain_per_plan_hook = explain_per_plan_hook; + explain_per_plan_hook = pg_plan_advice_explain_per_plan_hook; +} + +/* + * Initialize shared state when first created. + */ +static void +pgpa_init_shared_state(void *ptr) +{ + pgpa_shared_state *state = (pgpa_shared_state *) ptr; + + LWLockInitialize(&state->lock, LWLockNewTrancheId()); + state->dsa_tranche = LWLockNewTrancheId(); + state->area = DSA_HANDLE_INVALID; + state->shared_collector = InvalidDsaPointer; +} + +/* + * Return a pointer to a memory context where long-lived data managed by this + * module can be stored. + */ +MemoryContext +pg_plan_advice_get_mcxt(void) +{ + if (pgpa_memory_context == NULL) + pgpa_memory_context = AllocSetContextCreate(TopMemoryContext, + "pg_plan_advice", + ALLOCSET_DEFAULT_SIZES); + + return pgpa_memory_context; +} + +/* + * Get a pointer to our shared state. + * + * If no shared state exists, create and initialize it. If it does exist but + * this backend has not yet accessed it, attach to it. Otherwise, just return + * our cached pointer. + * + * Along the way, make sure the relevant LWLock tranches are registered. + */ +pgpa_shared_state * +pg_plan_advice_attach(void) +{ + pgpa_shared_state *state; + bool found; + + if (pgpa_state == NULL) + { + state = GetNamedDSMSegment("pg_plan_advice", sizeof(pgpa_shared_state), + pgpa_init_shared_state, &found); + LWLockRegisterTranche(state->lock.tranche, "pg_plan_advice_lock"); + LWLockRegisterTranche(state->dsa_tranche, "pg_plan_advice_dsa"); + pgpa_state = state; + } + + return pgpa_state; +} + +/* + * Return a pointer to pg_plan_advice's DSA area, creating it if needed. + */ +dsa_area * +pg_plan_advice_dsa_area(void) +{ + if (pgpa_dsa_area == NULL) + { + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_handle area_handle; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + area_handle = state->area; + if (area_handle == DSA_HANDLE_INVALID) + { + pgpa_dsa_area = dsa_create(state->dsa_tranche); + dsa_pin(pgpa_dsa_area); + state->area = dsa_get_handle(pgpa_dsa_area); + LWLockRelease(&state->lock); + } + else + { + LWLockRelease(&state->lock); + pgpa_dsa_area = dsa_attach(area_handle); + } + + dsa_pin_mapping(pgpa_dsa_area); + + MemoryContextSwitchTo(oldcontext); + } + + return pgpa_dsa_area; +} + +static void +pg_plan_advice_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + PlannedStmt *pstmt = queryDesc->plannedstmt; + + if (pg_plan_advice_local_collection_limit > 0 + || pg_plan_advice_shared_collection_limit > 0) + { + char *advice; + + advice = pg_plan_advice_generate(pstmt); + + /* + * If the advice string is non-empty, pass it to the collectors. + * + * A query such as SELECT 2+2 or SELECT * FROM generate_series(1,10) + * will not produce any advice, since there are no query planning + * decisions that can be influenced. It wouldn't exactly be wrong to + * record the query together with the empty advice string, but there + * doesn't seem to be much value in it, so skip it to save space. + * + * If this proves confusing to users, we might need to revist the + * behavior here. + */ + if (advice[0] != '\0') + pgpa_collect_advice(pstmt->queryId, queryDesc->sourceText, + advice); + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); +} + +/* + * Handler for EXPLAIN (PLAN_ADVICE). + */ +static void +pg_plan_advice_explain_option_handler(ExplainState *es, DefElem *opt, + ParseState *pstate) +{ + bool *plan_advice; + + plan_advice = GetExplainExtensionState(es, es_extension_id); + + if (plan_advice == NULL) + { + plan_advice = palloc0_object(bool); + SetExplainExtensionState(es, es_extension_id, plan_advice); + } + + *plan_advice = defGetBoolean(opt); +} + +/* + * If the PLAN_ADVICE option was specified -- and not set to FALSE -- generate + * advice from the provided plan and add it to the EXPLAIN output. + */ +static void +pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv) +{ + bool *plan_advice; + + if (prev_explain_per_plan_hook) + prev_explain_per_plan_hook(plannedstmt, into, es, queryString, + params, queryEnv); + + plan_advice = GetExplainExtensionState(es, es_extension_id); + if (plan_advice != NULL && *plan_advice) + { + char *advice = pg_plan_advice_generate(plannedstmt); + + /* + * The advice string likely spans multiple lines; the last line will + * not end in a newline, but the others will. In text format, it looks + * nicest to indent each line of the advice separately, beginning on + * the line below the "Plan Advice" label. For non-text formats, it's + * best not to add any special handling. + */ + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyText("Plan Advice", advice, es); + else if (*advice != '\0') + { + char *s; + + ExplainIndentText(es); + appendStringInfo(es->str, "Plan Advice:\n"); + + es->indent++; + + while ((s = strchr(advice, '\n')) != NULL) + { + ExplainIndentText(es); + appendBinaryStringInfo(es->str, advice, (s - advice) + 1); + advice = s + 1; + } + + if (*advice != '\0') + { + ExplainIndentText(es); + appendStringInfo(es->str, "%s\n", advice); + } + + es->indent--; + } + } +} + +/* + * Generate advice from a query plan and send it to the relevant collectors. + */ +static char * +pg_plan_advice_generate(PlannedStmt *pstmt) +{ + pgpa_plan_walker_context walker; + StringInfoData buf; + ListCell *lc; + const char **rt_identifiers; + + /* Construct unique identifiers for non-JOIN RTEs. */ + rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt); + + /* Initialization. */ + memset(&walker, 0, sizeof(pgpa_plan_walker_context)); + walker.pstmt = pstmt; + + /* Walk the main plan tree. */ + pgpa_plan_walker(&walker, pstmt->planTree, 0, NULL, NIL); + + /* Main plan tree walk won't reach subplans, so walk those. */ + foreach(lc, pstmt->subplans) + { + Plan *plan = lfirst(lc); + + if (plan != NULL) + pgpa_plan_walker(&walker, plan, 0, NULL, NIL); + } + + /* Put advice into string form. */ + initStringInfo(&buf); + pgpa_output_advice(&buf, &walker, rt_identifiers); + return buf.data; +} diff --git a/contrib/pg_plan_advice/pg_plan_advice.control b/contrib/pg_plan_advice/pg_plan_advice.control new file mode 100644 index 0000000000..aa6fdc9e7b --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.control @@ -0,0 +1,5 @@ +# pg_plan_advice extension +comment = 'help the planner get the right plan' +default_version = '1.0' +module_pathname = '$libdir/pg_plan_advice' +relocatable = true diff --git a/contrib/pg_plan_advice/pg_plan_advice.h b/contrib/pg_plan_advice/pg_plan_advice.h new file mode 100644 index 0000000000..385d3f3052 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.h + * header file for pg_plan_advice contrib module + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_PLAN_ADVICE_H +#define PG_PLAN_ADVICE_H + +#include "storage/lwlock.h" +#include "utils/dsa.h" + +typedef struct pgpa_shared_state +{ + LWLock lock; + int dsa_tranche; + dsa_handle area; + dsa_pointer shared_collector; +} pgpa_shared_state; + +/* GUC variables */ +extern int pg_plan_advice_local_collection_limit; +extern int pg_plan_advice_shared_collection_limit; + +/* Function prototypes */ +extern MemoryContext pg_plan_advice_get_mcxt(void); +extern pgpa_shared_state *pg_plan_advice_attach(void); +extern dsa_area *pg_plan_advice_dsa_area(void); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_collector.c b/contrib/pg_plan_advice/pgpa_collector.c new file mode 100644 index 0000000000..1bcefe1fdf --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.c @@ -0,0 +1,630 @@ +/*------------------------------------------------------------------------- + * + * pgpa_collector.c + * collect advice into backend-local or shared memory + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_collector.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" + +#include "datatype/timestamp.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "utils/builtins.h" +#include "utils/timestamp.h" + +PG_FUNCTION_INFO_V1(pg_clear_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_clear_collected_shared_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_shared_advice); + +#define ADVICE_CHUNK_SIZE 1024 +#define ADVICE_CHUNK_ARRAY_SIZE 64 + +#define PG_GET_ADVICE_COLUMNS 7 + +/* + * Advice extracted from one query plan, together with the query string + * and various other identifying details. + */ +typedef struct pgpa_collected_advice +{ + Oid userid; /* user OID */ + Oid dbid; /* database OID */ + uint64 queryid; /* query identifier */ + TimestampTz timestamp; /* query timestamp */ + int advice_offset; /* start of advice in textual data */ + char textual_data[FLEXIBLE_ARRAY_MEMBER]; +} pgpa_collected_advice; + +/* + * A bunch of pointers to pgpa_collected_advice objects, stored in + * backend-local memory. + */ +typedef struct pgpa_local_advice_chunk +{ + pgpa_collected_advice *entries[ADVICE_CHUNK_SIZE]; +} pgpa_local_advice_chunk; + +/* + * Information about all of the pgpa_collected_advice objects that we're + * storing in local memory. + * + * We assign consecutive IDs, starting from 0, to each pgpa_collected_advice + * object that we store. The actual storage is an array of chunks, which + * helps keep memcpy() overhead low when we start discarding older data. + */ +typedef struct pgpa_local_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + pgpa_local_advice_chunk **chunks; +} pgpa_local_advice; + +/* + * Just like pgpa_local_advice_chunk, but stored in a dynamic shared area, + * so we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice_chunk +{ + dsa_pointer entries[ADVICE_CHUNK_SIZE]; +} pgpa_shared_advice_chunk; + +/* + * Just like pgpa_local_advice, but stored in a dynamic shared area, so + * we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + dsa_pointer chunks; +} pgpa_shared_advice; + +/* Pointers to local and shared collectors */ +static pgpa_local_advice *local_collector = NULL; +static pgpa_shared_advice *shared_collector = NULL; + +/* Static functions */ +static pgpa_collected_advice *pgpa_make_collected_advice(Oid userid, + Oid dbid, + uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, + dsa_pointer *result); +static void pgpa_store_local_advice(pgpa_collected_advice *ca); +static void pgpa_trim_local_advice(int limit); +static void pgpa_store_shared_advice(dsa_pointer ca_pointer); +static void pgpa_trim_shared_advice(dsa_area *area, int limit); + +/* Helper function to extract the query string from pgpa_collected_advice */ +static inline const char * +query_string(pgpa_collected_advice *ca) +{ + return ca->textual_data; +} + +/* Helper function to extract the advice string from pgpa_collected_advice */ +static inline const char * +advice_string(pgpa_collected_advice *ca) +{ + return ca->textual_data + ca->advice_offset; +} + +/* + * Store collected query advice into the local or shared advice collector, + * as appropriate. + */ +void +pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string) +{ + Oid userid = GetUserId(); + Oid dbid = MyDatabaseId; + TimestampTz now = GetCurrentTimestamp(); + + if (pg_plan_advice_local_collection_limit > 0) + { + pgpa_collected_advice *ca; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + ca = pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, + NULL, NULL); + pgpa_store_local_advice(ca); + MemoryContextSwitchTo(oldcontext); + } + + if (pg_plan_advice_shared_collection_limit > 0) + { + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer ca_pointer; + + pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, area, + &ca_pointer); + pgpa_store_shared_advice(ca_pointer); + } +} + +/* + * Allocate and fill a new pgpa_collected_advice object. + * + * If area != NULL, it is used to allocate the new object, and the resulting + * dsa_pointer is returned via *result. + * + * If area == NULL, the new object is allocated in the current memory context, + * and result is not examined or modified. + */ +static pgpa_collected_advice * +pgpa_make_collected_advice(Oid userid, Oid dbid, uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, dsa_pointer *result) +{ + size_t query_string_length = strlen(query_string) + 1; + size_t advice_string_length = strlen(advice_string) + 1; + size_t total_length; + pgpa_collected_advice *ca; + + total_length = offsetof(pgpa_collected_advice, textual_data) + + query_string_length + advice_string_length; + + if (area == NULL) + ca = palloc(total_length); + else + { + *result = dsa_allocate(area, total_length); + ca = dsa_get_address(area, *result); + } + + ca->userid = GetUserId(); + ca->dbid = MyDatabaseId; + ca->queryid = queryId; + ca->timestamp = timestamp; + ca->advice_offset = query_string_length; + + memcpy(ca->textual_data, query_string, query_string_length); + memcpy(&ca->textual_data[ca->advice_offset], + advice_string, advice_string_length); + + return ca; +} + +/* + * Add a pg_collected_advice object to our backend-local advice collection. + * + * Caller is responsible for switching to the appropriate memory context; + * the provided object should have been allocated in that same context. + */ +static void +pgpa_store_local_advice(pgpa_collected_advice *ca) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_local_advice *la = local_collector; + + /* If the local advice collector isn't initialized yet, do that now. */ + if (la == NULL) + { + la = palloc0(sizeof(pgpa_local_advice)); + la->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = palloc0_array(pgpa_local_advice_chunk *, + la->chunk_array_allocated_size); + local_collector = la; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (la->next_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->next_id - la->base_id) % ADVICE_CHUNK_SIZE; + + /* Extend chunk array, if needed. */ + if (chunk_number > la->chunk_array_allocated_size) + { + int new_size; + + new_size = la->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = repalloc0_array(la->chunks, + pgpa_local_advice_chunk *, + la->chunk_array_allocated_size, + new_size); + la->chunk_array_allocated_size = new_size; + } + + /* Allocate new chunk, if needed. */ + if (la->chunks[chunk_number] == NULL) + la->chunks[chunk_number] = palloc0_object(pgpa_local_advice_chunk); + + /* Save pointer and bump next-id counter. */ + Assert(la->chunks[chunk_number]->entries[chunk_offset] == NULL); + la->chunks[chunk_number]->entries[chunk_offset] = ca; + ++la->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_local_advice(pg_plan_advice_local_collection_limit); +} + +/* + * Add a pg_collected_advice object to the shared advice collection. + * + * 'ca_pointer' should have been allocated from the pg_plan_advice DSA area + * and should point to an object of type pgpa_collected_advice. + */ +static void +pgpa_store_shared_advice(dsa_pointer ca_pointer) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + pgpa_shared_advice *sa = shared_collector; + dsa_pointer *chunk_array; + pgpa_shared_advice_chunk *chunk; + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now. + * If we're the first ones to attach, we may need to create the object. + */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + state->shared_collector = + dsa_allocate0(area, sizeof(pgpa_shared_advice)); + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* + * It's possible that some other backend may have succeeded in creating + * the main collector object but failed to allocate an initial chunk + * array, so we must be prepared to allocate the chunk array here whether + * or not we created the collector object. + */ + if (shared_collector->chunk_array_allocated_size == 0) + { + sa->chunks = + dsa_allocate0(area, + sizeof(dsa_pointer) * ADVICE_CHUNK_ARRAY_SIZE); + sa->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (sa->next_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->next_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + /* Get the address of the chunk array and, if needed, extend it. */ + if (chunk_number > sa->chunk_array_allocated_size) + { + int new_size; + dsa_pointer new_chunks; + + /* + * DSA can't enlarge an existing allocation, so we must make a new + * allocation and copy data over. + */ + new_size = sa->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + new_chunks = dsa_allocate0(area, sizeof(dsa_pointer) * new_size); + chunk_array = dsa_get_address(area, new_chunks); + memcpy(chunk_array, dsa_get_address(area, sa->chunks), + sizeof(dsa_pointer) * sa->chunk_array_allocated_size); + dsa_free(area, sa->chunks); + sa->chunks = new_chunks; + sa->chunk_array_allocated_size = new_size; + } + else + chunk_array = dsa_get_address(area, sa->chunks); + + /* Get the address of the desired chunk, allocating it if needed. */ + if (chunk_array[chunk_number] == InvalidDsaPointer) + chunk_array[chunk_number] = + dsa_allocate0(area, sizeof(pgpa_shared_advice_chunk)); + chunk = dsa_get_address(area, chunk_array[chunk_number]); + + /* Save pointer and bump next-id counter. */ + Assert(chunk->entries[chunk_offset] == InvalidDsaPointer); + chunk->entries[chunk_offset] = ca_pointer; + ++sa->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_shared_advice(area, pg_plan_advice_shared_collection_limit); + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); +} + +/* + * Discard collected advice stored in backend-local memory in excess of the + * specified limit. + */ +static void +pgpa_trim_local_advice(int limit) +{ + pgpa_local_advice *la = local_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = la->next_id - la->oldest_id; + if (current_count <= limit) + return; + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + + chunk_number = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->oldest_id - la->base_id) % ADVICE_CHUNK_SIZE; + + Assert(la->chunks[chunk_number]->entries[chunk_offset] != NULL); + pfree(la->chunks[chunk_number]->entries[chunk_offset]); + la->chunks[chunk_number]->entries[chunk_offset] = NULL; + ++la->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + pfree(la->chunks[n]); + + /* Slide remaining chunk pointers back toward the base of the array. */ + total_chunk_count = (la->next_id - la->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&la->chunks[0], &la->chunks[trim_chunk_count], + sizeof(pgpa_local_advice_chunk *) * remaining_chunk_count); + + /* Adjust base ID value accordingly. */ + la->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * Discard collected advice stored in shared memory in excess of the + * specified limit. + */ +static void +pgpa_trim_shared_advice(dsa_area *area, int limit) +{ + pgpa_shared_advice *sa = shared_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + dsa_pointer *chunk_array; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = sa->next_id - sa->oldest_id; + if (current_count <= limit) + return; + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + + chunk_number = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->oldest_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + Assert(chunk->entries[chunk_offset] != InvalidDsaPointer); + dsa_free(area, chunk->entries[chunk_offset]); + chunk->entries[chunk_offset] = InvalidDsaPointer; + ++sa->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + dsa_free(area, chunk_array[n]); + + /* Slide remaining chunk pointers back toward the base of the array. */ + total_chunk_count = (sa->next_id - sa->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&chunk_array[0], &chunk_array[trim_chunk_count], + sizeof(dsa_pointer) * remaining_chunk_count); + + /* Adjust base ID value accordingly. */ + sa->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_local_advice(PG_FUNCTION_ARGS) +{ + pgpa_trim_local_advice(0); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_shared_advice(PG_FUNCTION_ARGS) +{ + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. + */ + if (shared_collector == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = dsa_get_address(area, state->shared_collector); + } + + /* Do the real work */ + pgpa_trim_shared_advice(area, 0); + + LWLockRelease(&state->lock); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable SRF to return advice collected in backend-local memory + */ +Datum +pg_get_collected_local_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_local_advice *la = local_collector; + + /* + * XXX. Is this the correct thing from a security point of view? + */ + if (InSecurityRestrictedOperation()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("cannot call \"%s\" within security-restricted operation", + "pg_get_collected_local_advice"))); + + InitMaterializedSRF(fcinfo, 0); + + if (la == NULL) + return (Datum) 0; + + /* Loop over all entries. */ + for (uint64 id = la->oldest_id; id < la->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - la->base_id) % ADVICE_CHUNK_SIZE; + + ca = la->chunks[chunk_number]->entries[chunk_offset]; + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + +/* + * SQL-callable SRF to return advice collected in shared memory + */ +Datum +pg_get_collected_shared_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer *chunk_array; + pgpa_shared_advice *sa = shared_collector; + + InitMaterializedSRF(fcinfo, 0); + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_SHARED); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. + */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Loop over all entries. */ + for (uint64 id = sa->oldest_id; id < sa->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + ca = dsa_get_address(area, chunk->entries[chunk_offset]); + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); + + return (Datum) 0; +} diff --git a/contrib/pg_plan_advice/pgpa_collector.h b/contrib/pg_plan_advice/pgpa_collector.h new file mode 100644 index 0000000000..6f8efa27f0 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.h @@ -0,0 +1,3 @@ + +extern void pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string); diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c new file mode 100644 index 0000000000..56ec832b23 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.c + * create appropriate identifiers for range table entries + * + * The goal of this module is to be able to produce identifiers for range + * table entries that are unique, understandable to human beings, and + * able to be reconstructed during future planning cycles. As an + * exception, we do not care about, or want to produce, identifiers for + * RTE_JOIN entries. This is because (1) we would end up with a ton of + * RTEs with unhelpful names like unnamed_join_17; (2) not all joins have + * RTEs; and (3) we intend to refer to joins by their constituent members + * rather than by reference to the join RTE. + * + * In general, we construct identifiers of the following form: + * + * alias_name#occurrence_number/child_table_name@subquery_name + * + * However, occurrence_number is omitted when it is the first occurrence + * within the same subquery, child_table_name is omitted for relations that + * are not child tables, and subquery_name is omitted for the topmost + * query level. Whenever an item is omitted, the preceding punctuation mark + * is also omitted. Identifier-style escaping is applied to alias_name and + * subquery_name. child_table_name is always a schema-qualified name, and + * identifier-style escaping is applied to the schema and to the relation + * names separately. + * + * The upshot of all of these rules is that in simple cases, the relation + * identifier is textually identical to the alias name, making life easier + * for users. However, even in complex cases, every relation identifier + * for a given query will be unique (or at least we hope so: if not, this + * code is buggy and the identifier format might need to be rethought). + * + * A key goal of this system is that we want to be able to reconstruct the + * same identifiers during a future planning cycle for the same query, so + * that if a certain behavior is specified for a certain identifier, we can + * properly identify the RTI for which that behavior is mandated. In order + * for this to work, subquery names must be unique and known before the + * subquery is planned, and the remainder of the identifier must not depend + * on any part of the query outside of the current subquery level. In + * particular, occurrence_number must be calculated relative to the range + * table for the relevant subquery, not the final flattened range table. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_identifier.h" + +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +static Index *pgpa_create_top_rti_map(Index rtable_length, List *rtable, + List *appinfos); +static int pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti); + +/* + * Create a range table identifier from scratch. + * + * This function leaves the caller to do all the heavy lifting, so it's + * generally better to use one of the functions below instead. + * + * See the file header comments for more details on the format of an + * identifier. + */ +const char * +pgpa_create_identifier(const char *alias_name, int occurrence, + const char *partnsp, const char *partrel, + const char *plan_name) +{ + const char *result; + + result = quote_identifier(alias_name); + + if (occurrence != 1) + { + Assert(occurrence > 1); + result = psprintf("%s#%d", result, occurrence); + } + + if (partrel != NULL) + { + Assert(partnsp != NULL); + result = psprintf("%s/%s.%s", result, + quote_identifier(partnsp), + quote_identifier(partrel)); + } + + if (plan_name != NULL) + result = psprintf("%s@%s", result, quote_identifier(plan_name)); + + return result; +} + +/* + * Create a range table identifier for a given RTI. + */ +const char * +pgpa_create_identifier_by_rti(PlannerInfo *root, Index rti, char *plan_name) +{ + Index top_rti = rti; + int occurrence = 1; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + + /* + * If this is a child RTE, find the topmost parent that is still of type + * RTE_RELATION. We do this because we identify children of partitioned + * tables by the name of the child table, but subqueries can also have + * child rels and we don't care about those here. + */ + for (;;) + { + AppendRelInfo *appinfo = root->append_rel_array[top_rti]; + RangeTblEntry *parent_rte; + + if (appinfo == NULL) + break; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind != RTE_RELATION) + break; + + top_rti = appinfo->parent_relid; + } + + /* Get the range table entries for the RTI and top RTI. */ + rte = planner_rt_fetch(rti, root); + top_rte = planner_rt_fetch(top_rti, root); + Assert(rte->rtekind != RTE_JOIN); + Assert(top_rte->rtekind != RTE_JOIN); + + /* Work out the correct occurrence number. */ + for (Index prior_rti = 1; prior_rti < rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + AppendRelInfo *appinfo = root->append_rel_array[prior_rti]; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + */ + if (appinfo != NULL) + { + RangeTblEntry *parent_rte; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind == RTE_RELATION) + continue; + } + + /* Skip joins. */ + prior_rte = planner_rt_fetch(prior_rti, root); + if (prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. */ + ++occurrence; + } + + /* If this is a child table, get the schema and relation names. */ + if (rti != top_rti) + { + partnsp = get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* We have everything we need to create the identifier. */ + return pgpa_create_identifier(top_rte->eref->aliasname, + occurrence, partnsp, partrel, + plan_name); +} + +/* + * Create an array of range table identifiers for all the non-NULL, + * non-RTE_JOIN entries in the PlannedStmt's range table. + */ +const char ** +pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt) +{ + Index rtable_length = list_length(pstmt->rtable); + const char **result = palloc0_array(const char *, rtable_length); + Index *top_rti_map; + int rtinfoindex = 0; + SubPlanRTInfo *rtinfo = NULL; + SubPlanRTInfo *nextrtinfo = NULL; + + /* + * Account for relations addded by inheritance expansion of partitioned + * tables. + */ + top_rti_map = pgpa_create_top_rti_map(rtable_length, pstmt->rtable, + pstmt->appendRelations); + + /* + * When we begin iterating, we're processing the portion of the range + * table that originated from the top-level PlannerInfo, so subrtinfo is + * NULL. Later, subrtinfo will be the SubPlanRTInfo for the subquery whose + * portion of the range table we are processing. nextrtinfo is always the + * SubPlanRTInfo that follows the current one, if any, so when we're + * processing the top-level query's portion of the range table, the next + * SubPlanRTInfo is the very first one. + */ + if (pstmt->subrtinfos != NULL) + nextrtinfo = linitial(pstmt->subrtinfos); + + /* Main loop over the range table. */ + for (Index rti = 1; rti <= rtable_length; rti++) + { + const char *plan_name; + Index top_rti; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + int occurrence; + + /* + * Advance to the next SubPlanRTInfo, if it's time to do that. + * + * This loop probably shouldn't ever iterate more than once, because + * that would imply that a subquery was planned but added nothing to + * the range table; but let's be defensive and assume it can happen. + */ + while (nextrtinfo != NULL && rti > nextrtinfo->rtoffset) + { + rtinfo = nextrtinfo; + if (++rtinfoindex >= list_length(pstmt->subrtinfos)) + nextrtinfo = NULL; + else + nextrtinfo = list_nth(pstmt->subrtinfos, rtinfoindex); + } + + /* Fetch the range table entry, if any. */ + rte = rt_fetch(rti, pstmt->rtable); + + /* + * We can't and don't need to identify null entries, and we don't want + * to identify join entries. + */ + if (rte == NULL || rte->rtekind == RTE_JOIN) + continue; + + /* + * If this is not a relation added by partitioned table expansion, + * then the top RTI/RTE are just the same as this RTI/RTE. Otherwise, + * we need the information for the top RTI/RTE, and must also fetch + * the partition schema and name. + */ + top_rti = top_rti_map[rti - 1]; + if (rti == top_rti) + top_rte = rte; + else + { + top_rte = rt_fetch(top_rti, pstmt->rtable); + partnsp = + get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* Compute the correct occurrence number. */ + occurrence = pgpa_occurrence_number(pstmt->rtable, top_rti_map, + rtinfo, top_rti); + + /* Get the name of the current plan (NULL for toplevel query). */ + plan_name = rtinfo == NULL ? NULL : rtinfo->plan_name; + + /* OK, we're ready to create the identifier. */ + result[rti - 1] = pgpa_create_identifier(top_rte->eref->aliasname, + occurrence, + partnsp, partrel, + plan_name); + } + + return result; +} + +/* + * Build a mapping from each RTI to the RTI whose alias_name will be used to + * construct the range table identifier. + * + * For child relations, this is the topmost parent that is still of type + * RTE_RELATION. For other relations, it's just the original RTI. + * + * Since we're eventually going to need this information for every RTI in + * the range table, it's best to compute all the answers in a single pass over + * the AppendRelInfo list. Otherwise, we might end up searching through that + * list repeatedly for entries of interest. + * + * Note that the returned array is uses zero-based indexing, while RTIs use + * 1-based indexing, so subtract 1 from the RTI before looking it up in the + * array. + */ +static Index * +pgpa_create_top_rti_map(Index rtable_length, List *rtable, List *appinfos) +{ + Index *top_rti_map = palloc0_array(Index, rtable_length); + + /* Initially, make every RTI point to itself. */ + for (Index rti = 1; rti <= rtable_length; ++rti) + top_rti_map[rti - 1] = rti; + + /* Update the map for each AppendRelInfo object. */ + foreach_node(AppendRelInfo, appinfo, appinfos) + { + Index parent_rti = appinfo->parent_relid; + RangeTblEntry *parent_rte = rt_fetch(parent_rti, rtable); + + /* If the parent is not RTE_RELATION, ignore this entry. */ + if (parent_rte->rtekind != RTE_RELATION) + continue; + + /* + * Map the child to wherever we mapped the parent. Parents always + * precede their children in the AppendRelInfo list, so this should + * work out. + */ + top_rti_map[appinfo->child_relid - 1] = top_rti_map[parent_rti - 1]; + } + + return top_rti_map; +} + +/* + * Find the occurence number of a certain relation within a certain subquery. + * + * The same alias name can occur multiple times within a subquery, but we want + * to disambiguate by giving different occurrences different integer indexes. + * However, child tables are disambiguated by including the table name rather + * than by incrementing the occurrence number; and joins are not named and so + * shouldn't increment the occurence number either. + */ +static int +pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti) +{ + Index rtoffset = (rtinfo == NULL) ? 0 : rtinfo->rtoffset; + int occurrence = 1; + RangeTblEntry *rte = rt_fetch(rti, rtable); + + for (Index prior_rti = rtoffset + 1; prior_rti < rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + */ + if (top_rti_map[prior_rti - 1] != prior_rti) + break; + + /* Skip joins. */ + prior_rte = rt_fetch(prior_rti, rtable); + if (prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. */ + ++occurrence; + } + + return occurrence; +} diff --git a/contrib/pg_plan_advice/pgpa_identifier.h b/contrib/pg_plan_advice/pgpa_identifier.h new file mode 100644 index 0000000000..7684ba0a6a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.h + * create appropriate identifiers for range table entries + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PGPA_IDENTIFIER_H +#define PGPA_IDENTIFIER_H + +#include "nodes/pathnodes.h" +#include "nodes/plannodes.h" + +extern const char *pgpa_create_identifier(const char *alias_name, + int occurrence, + const char *partnsp, + const char *partrel, + const char *plan_name); +extern const char *pgpa_create_identifier_by_rti(PlannerInfo *root, + Index rti, + char *plan_name); +extern const char **pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c new file mode 100644 index 0000000000..1984bd1352 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -0,0 +1,601 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.c + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/pathnodes.h" +#include "nodes/print.h" +#include "parser/parsetree.h" + +/* + * Temporary object used when unrolling a join tree. + */ +struct pgpa_join_unroller +{ + unsigned nallocated; + unsigned nused; + Plan *outer_subplan; + ElidedNode *outer_elided_node; + pgpa_join_strategy *strategy; + Plan **inner_subplans; + ElidedNode **inner_elided_nodes; + pgpa_join_unroller **inner_unrollers; +}; + +static pgpa_join_strategy pgpa_decompose_join(pgpa_plan_walker_context *walker, + Plan *plan, + Plan **realouter, + Plan **realinner, + ElidedNode **elidedrealouter, + ElidedNode **elidedrealinner); +static ElidedNode *pgpa_descend_node(PlannedStmt *pstmt, Plan **plan); +static ElidedNode *pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan); +static bool pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node); + +static Bitmapset *pgpa_add_member_relids(Bitmapset *relids, + pgpa_join_member *member); + +static bool is_result_node_with_child(Plan *plan); +static bool is_sorting_plan(Plan *plan); + +/* + * Create an initially-empty object for unrolling joins. + * + * See comments for pgpa_unrolled_join and pgpa_clumped_join in pgpa_join.h + * for a discussion of clumped and unrolled joins. This function creates a + * helper object that can later be used to create a pgpa_unrolled_join, after + * first calling pgpa_unroll_join one or more times. + */ +pgpa_join_unroller * +pgpa_create_join_unroller(void) +{ + pgpa_join_unroller *join_unroller; + + join_unroller = palloc0_object(pgpa_join_unroller); + join_unroller->nallocated = 4; + join_unroller->strategy = + palloc_array(pgpa_join_strategy, join_unroller->nallocated); + join_unroller->inner_subplans = + palloc_array(Plan *, join_unroller->nallocated); + join_unroller->inner_elided_nodes = + palloc_array(ElidedNode *, join_unroller->nallocated); + join_unroller->inner_unrollers = + palloc_array(pgpa_join_unroller *, join_unroller->nallocated); + + return join_unroller; +} + +/* + * Unroll one level of an unrollable join tree. + * + * Our basic goal here is to unroll join trees as they occur in the Plan + * tree into a simpler and more regular structure that we can more easily + * use for further processing. Unrolling is outer-deep, so if the plan tree + * has Join1(Join2(A,B),Join3(C,D)), the same join unroller object should be + * used for Join1 and Join2, but a different one will be needed for Join3, + * since that involves a join within the *inner* side of another join. + * + * pgpa_plan_walker creates a "top level" join unroller object when it + * encounters a join in a portion of the plan tree in which no join unroller + * is already active. From there, this function is responsible for determing + * to what portion of the plan tree that join unroller applies, and for + * creating any subordinate join unroller objects that are needed as a result + * of non-outer-deep join trees. We do this by returning the join unroller + * objects that should be used for further traversal of the outer and inner + * subtrees of the current plan node via *outer_join_unroller and + * *inner_join_unroller, respectively. + */ +void +pgpa_unroll_join(pgpa_plan_walker_context *walker, Plan *plan, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller) +{ + pgpa_join_strategy strategy; + Plan *realinner, + *realouter; + ElidedNode *elidedinner, + *elidedouter; + int n; + + Assert(join_unroller != NULL); + + /* + * We need to pass the join_unroller object down through certain types of + * plan nodes -- anything that's considered part of the join strategy, and + * any other nodes that can occur in a join tree despite not being scans + * or joins. + * + * This includes: + * + * (1) Materialize, Memoize, and Hash nodes, which are part of the join + * strategy, + * + * (2) Gather and Gather Merge nodes, which can occur at any point in the + * join tree where the planner decided to initiate parallelism, + * + * (3) Sort and IncrementalSort nodes, which can occur beneath MergeJoin + * or GatherMerge, + * + * (4) Agg and Unique nodes, which can occur when we decide to make the + * nullable side of a semijoin unique and then join the result, and + * + * (5) Result nodes with children, which can be added either to project to + * enforce a one-time filter (but Result nodes without children are + * degenerate scans or joins). + */ + if (IsA(plan, Material) || IsA(plan, Memoize) || IsA(plan, Hash) + || IsA(plan, Gather) || IsA(plan, GatherMerge) + || is_sorting_plan(plan) || IsA(plan, Agg) || IsA(plan, Unique) + || is_result_node_with_child(plan)) + { + *outer_join_unroller = join_unroller; + return; + } + + /* + * Since we've already handled nodes that require pass-through treatment, + * this should be an unrollable join. + */ + strategy = pgpa_decompose_join(walker, plan, + &realouter, &realinner, + &elidedouter, &elidedinner); + + /* If our workspace is full, expand it. */ + if (join_unroller->nused >= join_unroller->nallocated) + { + join_unroller->nallocated *= 2; + join_unroller->strategy = + repalloc_array(join_unroller->strategy, + pgpa_join_strategy, + join_unroller->nallocated); + join_unroller->inner_subplans = + repalloc_array(join_unroller->inner_subplans, + Plan *, + join_unroller->nallocated); + join_unroller->inner_elided_nodes = + repalloc_array(join_unroller->inner_elided_nodes, + ElidedNode *, + join_unroller->nallocated); + join_unroller->inner_unrollers = + repalloc_array(join_unroller->inner_unrollers, + pgpa_join_unroller *, + join_unroller->nallocated); + } + + /* + * Since we're flattening outer-deep join trees, it follows that if the + * outer side is still an unrollable join, it should be unrolled into this + * same object. Otherwise, we've reached the limit of what we can unroll + * into this object and must remember the outer side as the final outer + * subplan. + */ + if (elidedouter == NULL && pgpa_is_join(realouter)) + *outer_join_unroller = join_unroller; + else + { + join_unroller->outer_subplan = realouter; + join_unroller->outer_elided_node = elidedouter; + } + + /* + * Store the inner subplan. If it's an unrollable join, it needs to be + * flattened in turn, but into a new unroller object, not this one. + */ + n = join_unroller->nused++; + join_unroller->strategy[n] = strategy; + join_unroller->inner_subplans[n] = realinner; + join_unroller->inner_elided_nodes[n] = elidedinner; + if (elidedinner == NULL && pgpa_is_join(realinner)) + *inner_join_unroller = pgpa_create_join_unroller(); + else + *inner_join_unroller = NULL; + join_unroller->inner_unrollers[n] = *inner_join_unroller; +} + +/* + * Use the data we've accumulated in a pgpa_join_unroller object to construct + * a pgpa_unrolled_join. + */ +pgpa_unrolled_join * +pgpa_build_unrolled_join(PlannedStmt *pstmt, + pgpa_join_unroller *join_unroller) +{ + pgpa_unrolled_join *ujoin; + int i; + + /* + * We shouldn't have gone even so far as to create a join unroller unless + * we found at least one unrollable join. + */ + Assert(join_unroller->nused > 0); + + /* Allocate result structures. */ + ujoin = palloc0_object(pgpa_unrolled_join); + ujoin->ninner = join_unroller->nused; + ujoin->strategy = palloc0_array(pgpa_join_strategy, join_unroller->nused); + ujoin->inner = palloc0_array(pgpa_join_member, join_unroller->nused); + + /* Handle the outermost join. */ + ujoin->outer.plan = join_unroller->outer_subplan; + ujoin->outer.elided_node = join_unroller->outer_elided_node; + ujoin->outer.scan = pgpa_build_scan(pstmt, ujoin->outer.plan, + ujoin->outer.elided_node, true); + + /* + * We want the joins from the deepest part of the plan tree to appear + * first in the result object, but the join unroller adds them in exactly + * the reverse of that order, so we need to flip the order of the arrays + * when constructing the final result. + */ + for (i = 0; i < join_unroller->nused; ++i) + { + int k = join_unroller->nused - i - 1; + + /* Copy strategy, Plan, and ElidedNode. */ + ujoin->strategy[i] = join_unroller->strategy[k]; + ujoin->inner[i].plan = join_unroller->inner_subplans[k]; + ujoin->inner[i].elided_node = join_unroller->inner_elided_nodes[k]; + + /* + * Fill in remaining details, using either the nested join unroller, + * or by deriving them from the plan and elided nodes. + */ + if (join_unroller->inner_unrollers[k] != NULL) + ujoin->inner[i].unrolled_join = + pgpa_build_unrolled_join(pstmt, + join_unroller->inner_unrollers[k]); + else + ujoin->inner[i].scan = pgpa_build_scan(pstmt, ujoin->inner[i].plan, + ujoin->inner[i].elided_node, + true); + } + + return ujoin; +} + +/* + * Free memory allocated for pgpa_join_unroller. + */ +void +pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller) +{ + pfree(join_unroller->strategy); + pfree(join_unroller->inner_subplans); + pfree(join_unroller->inner_elided_nodes); + pfree(join_unroller->inner_unrollers); + pfree(join_unroller); +} + +/* + * Identify the join strategy used by a join and the "real" inner and outer + * plans. + * + * For example, a Hash Join always has a Hash node on the inner side, but + * for all intents and purposes the real inner input is the Hash node's child, + * not the Hash node itself. + * + * Likewise, a Merge Join may have Sort note on the inner or outer side; if + * it does, the real input to the join is the Sort node's child, not the + * Sort node itself. + * + * In addition, with a Merge Join or a Nested Loop, the join planning code + * may add additional nodes such as Materialize or Memoize. We regard these + * as an aspect of the join strategy. As in the previous cases, the true input + * to the join is the underlying node. + * + * However, if any involved child node previously had a now-elided node stacked + * on top, then we can't "look through" that node -- indeed, what's going to be + * relevant for our purposes is the ElidedNode on top of that plan node, rather + * than the plan node itself. + * + * If there are multiple elided nodes, we want that one that would have been + * uppermost in the plan tree prior to setrefs processing; we expect to find + * that one last in the list of elided nodes. + * + * On return *realouter and *realinner will have been set to the real inner + * and real outer plans that we identified, and *elidedrealouter and + * *elidedrealinner to the last of any correspoding elided nodes. + */ +static pgpa_join_strategy +pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan, + Plan **realouter, Plan **realinner, + ElidedNode **elidedrealouter, ElidedNode **elidedrealinner) +{ + PlannedStmt *pstmt = walker->pstmt; + JoinType jointype = ((Join *) plan)->jointype; + Plan *outerplan = plan->lefttree; + Plan *innerplan = plan->righttree; + ElidedNode *elidedouter; + ElidedNode *elidedinner; + pgpa_join_strategy strategy; + bool uniqueouter; + bool uniqueinner; + + elidedouter = pgpa_last_elided_node(pstmt, outerplan); + elidedinner = pgpa_last_elided_node(pstmt, innerplan); + + switch (nodeTag(plan)) + { + case T_MergeJoin: + + /* + * The planner may have chosen to place a Material node on the + * inner side of the MergeJoin; if this is present, we record it + * as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_MERGE_JOIN_MATERIALIZE; + } + else + strategy = JSTRAT_MERGE_JOIN_PLAIN; + + /* + * For a MergeJoin, either the outer or the inner subplan, or + * both, may have needed to be sorted; we must disregard any Sort + * or IncrementalSort node to find the real inner or outer + * subplan. + */ + if (elidedouter == NULL && is_sorting_plan(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_sorting_plan(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + break; + + case T_NestLoop: + + /* + * The planner may have chosen to place a Material or Memoize node + * on the inner side of the NestLoop; if this is present, we + * record it as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MATERIALIZE; + } + else if (elidedinner == NULL && IsA(innerplan, Memoize)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MEMOIZE; + } + else + strategy = JSTRAT_NESTED_LOOP_PLAIN; + break; + + case T_HashJoin: + + /* + * The inner subplan of a HashJoin is always a Hash node; the real + * inner subplan is the Hash node's child. + */ + Assert(IsA(innerplan, Hash)); + Assert(elidedinner == NULL); + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_HASH_JOIN; + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan)); + } + + /* + * The planner may have decided to implement a semijoin by first making + * the nullable side of the plan unique, and then performing a normal join + * against the result. Therefore, we might need to descend through a + * unique node on either side of the plan. + */ + uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter); + uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner); + + /* + * The planner may have decided to parallelize part of the join tree, so + * we could find a Gather or Gather Merge node here. Note that, if + * present, this will appear below both (1) nodes we considered as part of + * the join strategy and (2) nodes inserted for uniqueness, because we + * currently only consider gathering a partial path on top of the final + * path for some RelOptInfo. + */ + if (elidedouter == NULL) + elidedouter = pgpa_descend_any_gather(pstmt, &outerplan); + if (elidedinner == NULL) + elidedinner = pgpa_descend_any_gather(pstmt, &innerplan); + + /* + * It's possible that Result node has been inserted either to project a + * target list or to implement a one-time filter. If so, we can descend + * throught it. Note that a result node without a child would be a + * degenerate scan or join, and not something we could descend through. + * + * XXX. I suspect it's possible for this to happen above the Gather or + * Gather Merge node, too, but apparently we have no test case for that + * scenario. + */ + if (elidedouter == NULL && is_result_node_with_child(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_result_node_with_child(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + + /* + * If this is a semijoin that was converted to an inner join by making one + * side or the other unique, make a note that the inner or outer subplan, + * as appropriate, should be treated as a query plan feature when the main + * tree traversal reaches it. + * + * Conversely, if the planner could have made one side of the join unique + * and thereby converted it to an inner join, and chose not to do so, that + * is also worth noting. (XXX: I'm not sure that we need to emit + * non-unique advice in every case where these join-type tests pass.) + * + * NB: This code could appear slightly higher up in in this function, but + * none of the nodes through which we just descended should be have + * associated RTIs. + * + * NB: This seems like a somewhat hacky way of passing information up to + * the main tree walk, but I don't currently have a better idea. + */ + if (uniqueouter) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, outerplan); + else if (jointype == JOIN_RIGHT_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, outerplan); + if (uniqueinner) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, innerplan); + else if (jointype == JOIN_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, innerplan); + + /* Set output parameters. */ + *realouter = outerplan; + *realinner = innerplan; + *elidedrealouter = elidedouter; + *elidedrealinner = elidedinner; + return strategy; +} + +/* + * Descend through a Plan node in a join tree that the caller has determined + * to be irrelevant. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. + */ +static ElidedNode * +pgpa_descend_node(PlannedStmt *pstmt, Plan **plan) +{ + *plan = (*plan)->lefttree; + return pgpa_last_elided_node(pstmt, *plan); +} + +/* + * Descend through a Gather or Gather Merge node, if present, and any Sort + * or IncrementalSort node occurring under a Gather Merge. + * + * Caller should have verified that there is no ElidedNode pertaining to + * the initial value of *plan. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. + */ +static ElidedNode * +pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan) +{ + if (IsA(*plan, Gather)) + return pgpa_descend_node(pstmt, plan); + + if (IsA(*plan, GatherMerge)) + { + ElidedNode *elided = pgpa_descend_node(pstmt, plan); + + if (elided == NULL && is_sorting_plan(*plan)) + elided = pgpa_descend_node(pstmt, plan); + + return elided; + } + + return NULL; +} + +/* + * If *plan is an Agg or Unique node, we want to descend through it, unless + * it has a corresponding elided node. If its immediate child is a Sort or + * IncrementalSort, we also want to descend through that, unless it has a + * corresponding elided node. + * + * On entry, *elided_node must be the last of any elided nodes corresponding + * to *plan; on exit, this will still be true, but *plan may have been updated. + * + * The reason we don't want to descend through elided nodes is that a single + * join tree can't cross through any sort of elided node: subqueries are + * planned separately, and planning inside an Append or MergeAppend is + * separate from planning outside of it. + * + * The return value is true if we descend through at least one node, and + * otherwise false. + */ +static bool +pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node) +{ + if (*elided_node != NULL) + return false; + + if (IsA(*plan, Agg) || IsA(*plan, Unique)) + { + *elided_node = pgpa_descend_node(pstmt, plan); + + if (*elided_node == NULL && is_sorting_plan(*plan)) + *elided_node = pgpa_descend_node(pstmt, plan); + + return true; + } + + return false; +} + +/* + * Create a Bitmapset of all RTIs that we associated with an unrolled join. + * + * As usual, join RTIs are excluded. + */ +Bitmapset * +pgpa_unrolled_join_all_relids(pgpa_unrolled_join *join) +{ + Bitmapset *relids = NULL; + + relids = pgpa_add_member_relids(relids, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + relids = pgpa_add_member_relids(relids, &join->inner[k]); + + return relids; +} + +/* + * Add all RTIs that we associate with a join member to the provided set, + * returning the result. + */ +static Bitmapset * +pgpa_add_member_relids(Bitmapset *relids, pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + return bms_union(relids, + pgpa_unrolled_join_all_relids(member->unrolled_join)); + else + { + Assert(member->scan != NULL); + return bms_union(relids, member->scan->relids); + } +} + +/* + * Is this a Result node that has a child? + */ +static bool +is_result_node_with_child(Plan *plan) +{ + return IsA(plan, Result) && plan->lefttree != NULL; +} + +/* + * Is this a Plan node whose purpose is put the data in a certain order? + */ +static bool +is_sorting_plan(Plan *plan) +{ + return IsA(plan, Sort) || IsA(plan, IncrementalSort); +} diff --git a/contrib/pg_plan_advice/pgpa_join.h b/contrib/pg_plan_advice/pgpa_join.h new file mode 100644 index 0000000000..7c861fb446 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.h @@ -0,0 +1,118 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.h + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_JOIN_H +#define PGPA_JOIN_H + +#include "nodes/plannodes.h" + +struct pgpa_plan_walker_context; +struct pgpa_clumped_join; +typedef struct pgpa_join_unroller pgpa_join_unroller; +typedef struct pgpa_unrolled_join pgpa_unrolled_join; + +/* + * Although there are three main join strategies, we try to classify things + * more precisely here: merge joins have the option of using materialization + * on the inner side, and nested loops can use either materialization or + * memoization. + */ +typedef enum +{ + JSTRAT_MERGE_JOIN_PLAIN = 0, + JSTRAT_MERGE_JOIN_MATERIALIZE, + JSTRAT_NESTED_LOOP_PLAIN, + JSTRAT_NESTED_LOOP_MATERIALIZE, + JSTRAT_NESTED_LOOP_MEMOIZE, + JSTRAT_HASH_JOIN + /* update NUM_PGPA_JOIN_STRATEGY if you add anything here */ +} pgpa_join_strategy; + +#define NUM_PGPA_JOIN_STRATEGY ((int) JSTRAT_HASH_JOIN + 1) + +/* + * In an outer-deep join tree, every member of an unrolled join will be a scan, + * but join trees with other shapes can contain unrolled joins. + * + * The plan node we store here will be the inner or outer child of the join + * node, as appropriate, except that we look through subnodes that we regard as + * part of the join method itself. For instance, for a Nested Loop that + * materializes the inner input, we'll store the child of the Materialize node, + * not the Materialize node itself. + * + * If setrefs processing elided one or more nodes from the plan tree, then + * we'll store details about the topmost of those in elided_node; otherwise, + * it will be NULL. + * + * Exactly one of scan and unrolled_join will be non-NULL. + */ +typedef struct +{ + Plan *plan; + ElidedNode *elided_node; + struct pgpa_scan *scan; + pgpa_unrolled_join *unrolled_join; +} pgpa_join_member; + +/* + * Non-clumped joins are unrolled; that is, we convert outer-deep join trees + * to a flat structure. ((A JOIN B) JOIN C) JOIN D gets converted to + * outer = A, inner = <B C D>, provided that none of the joins involved are + * clumped. When joins aren't outer-deep, substructure is required, e.g. + * (A JOIN B) JOIN (C JOIN D) is represented as outer = A, inner = <B X>, + * where X is a clumped or unrolled join covering C-D. + */ +struct pgpa_unrolled_join +{ + /* Outermost member; must not itself be an unrolled join. */ + pgpa_join_member outer; + + /* Number of inner members. Length of the strategy and inner arrays. */ + unsigned ninner; + + /* Array of strategies, one per non-outermost member. */ + pgpa_join_strategy *strategy; + + /* Array of members, excluding the outermost. Deepest first. */ + pgpa_join_member *inner; +}; + +/* + * Classification of Plan nodes based on what structure we should construct. + */ +typedef enum +{ + PGPA_NON_JOIN, + PGPA_CLUMPED_JOIN, + PGPA_UNROLLED_JOIN +} pgpa_join_class; + +/* + * Does this plan node inherit from Join? + */ +static inline bool +pgpa_is_join(Plan *plan) +{ + return IsA(plan, NestLoop) || IsA(plan, MergeJoin) || IsA(plan, HashJoin); +} + +extern pgpa_join_unroller *pgpa_create_join_unroller(void); +extern void pgpa_unroll_join(struct pgpa_plan_walker_context *walker, + Plan *plan, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller); +extern pgpa_unrolled_join *pgpa_build_unrolled_join(PlannedStmt *pstmt, + pgpa_join_unroller *join_unroller); +extern void pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller); +extern Bitmapset *pgpa_unrolled_join_all_relids(pgpa_unrolled_join *join); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_output.c b/contrib/pg_plan_advice/pgpa_output.c new file mode 100644 index 0000000000..1b660791ba --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.c @@ -0,0 +1,480 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.c + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_output.h" +#include "pgpa_scan.h" + +typedef struct pgpa_output_context +{ + const char **rt_identifiers; + StringInfo buf; + List *unrolled_joins[NUM_PGPA_JOIN_STRATEGY]; + List *scans[NUM_PGPA_SCAN_STRATEGY]; + List *query_features[NUM_PGPA_QF_TYPES]; + int wrap_column; +} pgpa_output_context; + +static void pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join); +static void pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member); +static void pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids); + +static char *pgpa_cstring_join_strategy(pgpa_join_strategy strategy); +static char *pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy); + +static void pgpa_maybe_linebreak(StringInfo buf, int wrap_column); + +/* + * Append query advice to the provided buffer. + * + * Before calling this function, 'walker' must be used to iterate over the + * main plan tree and all subplans from the PlannedStmt. + * + * 'rt_identifiers' is a table of unique identifiers, one for each RTI. + * See pgpa_create_identifiers_for_planned_stmt(). + * + * Results will be appended to 'buf'. + */ +void +pgpa_output_advice(StringInfo buf, pgpa_plan_walker_context *walker, + const char **rt_identifiers) +{ + ListCell *lc; + pgpa_output_context context; + + /* + * If the user chooses to use EXPLAIN (PLAN_ADVICE) in an 80-column window + * from a psql client with default settings, psql will add one space to the + * left of the output and EXPLAIN will add two more to the left of the + * advice. Thus, lines of more than 77 characters will wrap. We set the + * wrap limit to 76 here so that the output won't reach all the way to the + * very last column of the terminal. + * + * Of course, this is fairly arbitrary set of assumptions, and one could + * well make an argument for a different wrap limit, or for a configurable + * one. + */ + memset(&context, 0, sizeof(pgpa_output_context)); + context.rt_identifiers = rt_identifiers; + context.buf = buf; + context.wrap_column = 76; + + /* + * Put all the top-level scans for each strategy into a single list. + */ + foreach(lc, walker->scans) + { + pgpa_scan *scan = lfirst(lc); + + context.scans[scan->strategy] = + lappend(context.scans[scan->strategy], scan->relids); + } + + /* + * Each piece of JOIN_ORDER() advice fully describes the join order for a + * a single unrolled join. Merging is not permitted, because that would + * change the meaning, e.g. SEQ_SCAN(a b c d) means simply that sequential + * scans should be used for all of those relations, and is thus equivalent + * to SEQ_SCAN(a b) SEQ_SCAN(c d), but JOIN_ORDER(a b c d) means that "a" + * is the driving table which is then joined to "b" then "c" then "d", + * which is totally different from JOIN_ORDER(a b) and JOIN_ORDER(c d). + * + * As a side effect, the calls to pgpa_output_unrolled_join() will add to + * the context.unrolled_joins[] lists and context.scans[] lists, which + * will be important for emitting join and scan strategy advice further + * down. + */ + foreach(lc, walker->unrolled_joins) + { + pgpa_unrolled_join *ujoin = lfirst(lc); + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(context.buf, "JOIN_ORDER("); + pgpa_output_unrolled_join(&context, ujoin); + appendStringInfoChar(context.buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* + * Emit just one piece of advice for each unrolled-join strategy that is + * used at least once in the query. + */ + for (int s = 0; s < NUM_PGPA_JOIN_STRATEGY; ++s) + { + char *strategy = pgpa_cstring_join_strategy(s); + bool first = true; + + if (context.unrolled_joins[s] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(buf, "%s(", strategy); + + foreach(lc, context.unrolled_joins[s]) + { + Bitmapset *relids = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* + * Emit just one piece of advice for each scan strategy that is used at + * least once in the query. + * + * XXX. We shouldn't emit anything for ordinary scans, and we should + * emit more detailed information for index, index-only, and + * bitmap-heap scans. + */ + for (int c = 0; c < NUM_PGPA_SCAN_STRATEGY; ++c) + { + char *cstrategy = pgpa_cstring_scan_strategy(c); + bool first = true; + + if (context.scans[c] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(buf, "%s(", cstrategy); + + foreach(lc, context.scans[c]) + { + Bitmapset *relids = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* Sort query features into one list per query feature type. */ + foreach(lc, walker->query_features) + { + pgpa_query_feature *qf = lfirst(lc); + + context.query_features[qf->type] = + lappend(context.query_features[qf->type], qf); + } + + /* + * Emit just one piece of advice for each type of query feature that is + * used at least once in the query. + */ + for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t) + { + bool first = true; + + if (context.query_features[t] == NIL) + continue; + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + + switch (t) + { + case PGPAQF_GATHER: + appendStringInfo(buf, "GATHER("); + break; + case PGPAQF_GATHER_MERGE: + appendStringInfo(buf, "GATHER_MERGE("); + break; + case PGPAQF_SEMIJOIN_NON_UNIQUE: + appendStringInfo(buf, "SEMIJOIN_NON_UNIQUE("); + break; + case PGPAQF_SEMIJOIN_UNIQUE: + appendStringInfo(buf, "SEMIJOIN_UNIQUE("); + break; + } + + foreach(lc, context.query_features[t]) + { + pgpa_query_feature *qf = lfirst(lc); + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context.buf, context.wrap_column); + appendStringInfoChar(context.buf, ' '); + } + + if (bms_membership(qf->relids) == BMS_SINGLETON) + pgpa_output_relations(&context, buf, qf->relids); + else + { + appendStringInfoChar(buf, '('); + pgpa_output_relations(&context, buf, qf->relids); + appendStringInfoChar(buf, ')'); + } + } + appendStringInfoChar(buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } +} + +/* + * Output the members of an unrolled join, first the outermost member, and + * then the inner members one by one, as part of JOIN_ORDER() advice. + * + * As we visit members, we add to the context->unrolled_joins[] and + * context->scans[] lists. The former updates are done by this function + * directly, and the latter via pgpa_output_join_member. + */ +static void +pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join) +{ + pgpa_output_join_member(context, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + { + pgpa_join_member *member = &join->inner[k]; + Bitmapset *relids; + + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_join_member(context, member); + + if (member->unrolled_join != NULL) + relids = pgpa_unrolled_join_all_relids(member->unrolled_join); + else + { + Assert(member->scan != NULL); + relids = member->scan->relids; + } + context->unrolled_joins[join->strategy[k]] = + lappend(context->unrolled_joins[join->strategy[k]], relids); + } +} + +/* + * Output a single member of an unrolled join as part of JOIN_ORDER() advice. + * + * If that member happens to be a scan, also add it to the appropriate + * context->scans[] list. + */ +static void +pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + { + appendStringInfoChar(context->buf, '('); + pgpa_output_unrolled_join(context, member->unrolled_join); + appendStringInfoChar(context->buf, ')'); + } + else + { + pgpa_scan *scan = member->scan; + + Assert(scan != NULL); + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '{'); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, '}'); + } + context->scans[scan->strategy] = + lappend(context->scans[scan->strategy], scan->relids); + } +} + +/* + * Output the identifiers for each RTI in the provided set. + * + * Identifiers are separated by spaces, and a line break is possible after + * each one. + */ +static void +pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids) +{ + int rti = -1; + bool first = true; + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + const char *identifier = context->rt_identifiers[rti - 1]; + + if (identifier == NULL) + elog(ERROR, "no identifier for RTI %d", rti); + + if (first) + { + first = false; + appendStringInfoString(buf, identifier); + } + else + { + pgpa_maybe_linebreak(buf, context->wrap_column); + appendStringInfo(buf, " %s", identifier); + } + } +} + +/* + * Get a C string that corresponds to the specified join strategy. + */ +static char * +pgpa_cstring_join_strategy(pgpa_join_strategy strategy) +{ + switch (strategy) + { + case JSTRAT_MERGE_JOIN_PLAIN: + return "MERGE_JOIN_PLAIN"; + case JSTRAT_MERGE_JOIN_MATERIALIZE: + return "MERGE_JOIN_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_PLAIN: + return "NESTED_LOOP_PLAIN"; + case JSTRAT_NESTED_LOOP_MATERIALIZE: + return "NESTED_LOOP_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_MEMOIZE: + return "NESTED_LOOP_MEMOIZE"; + case JSTRAT_HASH_JOIN: + return "HASH_JOIN"; + } + + Assert(false); +} + +/* + * Get a C string that corresponds to the specified scan strategy. + */ +static char * +pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy) +{ + switch (strategy) + { + case PGPA_SCAN_ORDINARY: + return "ORDINARY_SCAN"; + case PGPA_SCAN_SEQ: + return "SEQ_SCAN"; + case PGPA_SCAN_BITMAP_HEAP: + return "BITMAP_HEAP_SCAN"; + case PGPA_SCAN_FOREIGN: + return "FOREIGN_SCAN"; + case PGPA_SCAN_CUSTOM: + return "CUSTOM_SCAN"; + case PGPA_SCAN_INDEX: + return "INDEX_SCAN"; + case PGPA_SCAN_INDEX_ONLY: + return "INDEX_ONLY_SCAN"; + case PGPA_SCAN_PARTITIONWISE: + return "PARTITIONWISE"; + } + + Assert(false); +} + +/* + * Insert a line break into the StringInfoData, if needed. + * + * If wrap_column is zero or negative, this does nothing. Otherwise, we + * consider inserting a newline. We only insert a newline if the length of + * the last line in the buffer exceeds wrap_column, and not if we'd be + * inserting a newline at or before the beginning of the current line. + * + * The position at which the newline is inserted is simply wherever the + * buffer ended the last time this function was called. In other words, + * the caller is expected to call this function every time we reach a good + * place for a line break. + */ +static void +pgpa_maybe_linebreak(StringInfo buf, int wrap_column) +{ + char *trailing_nl; + int line_start; + int save_cursor; + + /* If line wrapping is disabled, exit quickly. */ + if (wrap_column <= 0) + return; + + /* + * Set line_start to the byte offset within buf->data of the first + * character of the current line, where the current line means the last + * one in the buffer. Note that line_start could be the offset of the + * trailing '\0' if the last character in the buffer is a line break. + */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + line_start = 0; + else + line_start = (trailing_nl - buf->data) + 1; + + /* + * Remember that the current end of the buffer is a potential location to + * insert a line break on a future call to this function. + */ + save_cursor = buf->cursor; + buf->cursor = buf->len; + + /* If we haven't passed the wrap column, we don't need a newline. */ + if (buf->len - line_start <= wrap_column) + return; + + /* + * It only makes sense to insert a newline at a position later than the + * beginning of the current line. + */ + if (buf->cursor <= line_start) + return; + + /* Insert a newline at the previous cursor location. */ + enlargeStringInfo(buf, 1); + memmove(&buf->data[save_cursor] + 1, &buf->data[save_cursor], + buf->len - save_cursor); + ++buf->cursor; + buf->data[++buf->len] = '\0'; + buf->data[save_cursor] = '\n'; +} diff --git a/contrib/pg_plan_advice/pgpa_output.h b/contrib/pg_plan_advice/pgpa_output.h new file mode 100644 index 0000000000..f7d74d3794 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.h + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_OUTPUT_H +#define PGPA_OUTPUT_H + +#include "pgpa_walker.h" + +extern void pgpa_output_advice(StringInfo buf, + pgpa_plan_walker_context *walker, + const char **rt_identifiers); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_scan.c b/contrib/pg_plan_advice/pgpa_scan.c new file mode 100644 index 0000000000..5d0c6f5043 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.c + * analysis of scans in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/parsenodes.h" +#include "parser/parsetree.h" + +/* + * Build a pgpa_scan object for a Plan node. + * + * If there is at least one ElidedNode for this plan node, pass the uppermost + * one as elided_node, else pass NULL. + * + * Set the 'force' flag if we're inside of a join problem and not otherwise; + * see comments below for why this matters. + * + * The return value can be NULL only if force is false. + */ +pgpa_scan * +pgpa_build_scan(PlannedStmt *pstmt, Plan *plan, ElidedNode *elided_node, + bool force) +{ + pgpa_scan *scan; + pgpa_scan_strategy strategy = PGPA_SCAN_ORDINARY; + Bitmapset *relids = NULL; + int rti = -1; + bool filter_join_rtis = false; + + if (elided_node != NULL) + { + NodeTag elided_type = elided_node->elided_type; + + /* + * If setrefs processing elided an Append or MergeAppend node that + * had only one surviving child, then this is a partitionwise + * "scan" -- which may really be a partitionwise join, but there's + * no need to distinguish. + * + * If it's a trivial SubqueryScan that was elided, then this is an + * "ordinary" scan i.e. one for which we need to generate advice + * because the planner has not made any meaningful choice. + */ + relids = elided_node->relids; + if (elided_type == T_Append || elided_type == T_MergeAppend) + strategy = PGPA_SCAN_PARTITIONWISE; + else + strategy = PGPA_SCAN_ORDINARY; + filter_join_rtis = true; + } + else if ((rti = pgpa_scanrelid(plan)) != 0) + { + relids = bms_make_singleton(rti); + + switch (nodeTag(plan)) + { + case T_SeqScan: + strategy = PGPA_SCAN_SEQ; + break; + case T_BitmapHeapScan: + strategy = PGPA_SCAN_BITMAP_HEAP; + break; + case T_ForeignScan: + strategy = PGPA_SCAN_FOREIGN; + break; + case T_CustomScan: + strategy = PGPA_SCAN_CUSTOM; + break; + case T_IndexScan: + strategy = PGPA_SCAN_INDEX; + break; + case T_IndexOnlyScan: + strategy = PGPA_SCAN_INDEX_ONLY; + break; + default: + strategy = PGPA_SCAN_ORDINARY; + break; + } + } + else if ((relids = pgpa_relids(plan)) != NULL) + { + switch (nodeTag(plan)) + { + case T_ForeignScan: + strategy = PGPA_SCAN_FOREIGN; + break; + case T_Append: + case T_MergeAppend: + { + int first_rti; + RangeTblEntry *first_rte; + + first_rti = bms_next_member(relids, -1); + first_rte = rt_fetch(first_rti, pstmt->rtable); + + /* + * Append and MergeAppend nodes can be the result either + * of set operations or of inheritance expansion. In the + * latter case, this plan node is telling us that the relid + * set found in the plan node was handled partitionwise, + * but in the former, it isn't. Even if the set operations + * planner has multiple ways to execute some set operation, + * we lack the ability to describe what it decided here, + * and hence just label this as an ordinary scan. + * + * XXX. This breaks with a single Append node that involves + * both set operations and inheritance expansion. Consider: + * + * create table foo (a int) partition by range (a); + * create table foo0 partition of foo for values + * from (0) to (1000); + * create table foo1 partition of foo for values + * from (1000) to (2000); + * explain select * from foo union all select 1; + */ + if (first_rte->rtekind == RTE_RELATION) + strategy = PGPA_SCAN_PARTITIONWISE; + else + strategy = PGPA_SCAN_ORDINARY; + } + break; + default: + strategy = PGPA_SCAN_ORDINARY; + break; + } + filter_join_rtis = true; + } + + /* + * If this plan node has no associated RTIs, it's not a scan. When the + * 'force' flag is set, that's unexpected, so throw an error, else return + * quietly. + */ + if (relids == NULL) + { + if (force) + elog(ERROR, "plan node has no RTIs: %d", (int) nodeTag(plan)); + return NULL; + } + + /* + * Ordinary scans appearing outside of a join problem need not be tracked, + * since nothing interesting can be said either about the choice of join + * method or about the join order. + */ + if (!force && strategy == PGPA_SCAN_ORDINARY) + return NULL; + + /* Now create the result object. */ + scan = palloc(sizeof(pgpa_scan)); + scan->plan = plan; + scan->strategy = strategy; + + /* + * In some cases, it's possible for RTIs of type RTE_JOIN to be included, + * and when that is the case, we want to filter them out. We always refer + * to groups of relations in terms of the relation RTIs, not the join RTIs, + * so the join RTIs are just noise. + * + * When the code earlier in this function knows that no join RTIs can have + * infiltrated the relids set, it can set filter_join_rtis = false to save + * a few CPU cycles here. + * + * One important case where join RTIs can definitely occur is when we + * encounter a Result node where a whole join result has been proven empty. + */ + if (!filter_join_rtis) + scan->relids = relids; + else + { + scan->relids = NULL; + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, pstmt->rtable); + + if (rte->rtekind != RTE_JOIN) + scan->relids = bms_add_member(scan->relids, rti); + } + } + + return scan; +} diff --git a/contrib/pg_plan_advice/pgpa_scan.h b/contrib/pg_plan_advice/pgpa_scan.h new file mode 100644 index 0000000000..c2a7c645b7 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.h + * analysis of scans in Plan trees + * + * Note that our definition of "scan" is extremely broad. It includes + * (1) single plan nodes that scan multiple RTIs, such as a degenerate + * Result node that replaces what would otherwise have been a join, and + * (2) Append and MergeAppend nodes implementing a partitionwise scan + * or a partitionwise join. + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_SCAN_H +#define PGPA_SCAN_H + +#include "nodes/plannodes.h" + +/* + * Scan strategies. + * + * PGPA_SCAN_ORDINARY is any scan strategy that isn't interesting to us + * because there is no meaningful planner decision involved. For example, + * the only way to scan a subquery is a SubqueryScan, and the only way to + * scan a VALUES construct is a ValuesScan. We need not care exactly which + * type of planner node was used in such cases, because the same thing will + * happen when replanning. + * + * PGPA_SCAN_ORDINARY also includes Result nodes that correspond to scans + * or even joins that are proved empty. We don't know whether or not the scan + * or join will still be provably empty at replanning time, but if it is, + * then no scan-type advice is needed, and if it's not, we can't recommend + * a scan type based on the current plan. + * + * PGPA_SCAN_PARTITIONWISE also lumps together scans and joins: this can + * be either a partitionwise scan of a partitioned table or a partitionwise + * join between several partitioned tables. Note that all decisions about + * whether or not to use partitionwise join are meaningful: no matter what + * we decided this time, we could do more or fewer things partitionwise the + * next time. + * + * Other scan strategies map one-to-one to plan nodes. + */ +typedef enum +{ + PGPA_SCAN_ORDINARY = 0, + PGPA_SCAN_SEQ, + PGPA_SCAN_BITMAP_HEAP, + PGPA_SCAN_FOREIGN, + PGPA_SCAN_CUSTOM, + PGPA_SCAN_INDEX, + PGPA_SCAN_INDEX_ONLY, + PGPA_SCAN_PARTITIONWISE + /* update NUM_PGPA_SCAN_STRATEGY if you add anything here */ +} pgpa_scan_strategy; + +#define NUM_PGPA_SCAN_STRATEGY ((int) PGPA_SCAN_PARTITIONWISE + 1) + +/* + * All of the details we need regarding a scan. + */ +typedef struct pgpa_scan +{ + Plan *plan; + pgpa_scan_strategy strategy; + Bitmapset *relids; +} pgpa_scan; + +extern pgpa_scan *pgpa_build_scan(PlannedStmt *pstmt, Plan *plan, + ElidedNode *elided_node, bool force); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c new file mode 100644 index 0000000000..9eb739220a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.c @@ -0,0 +1,408 @@ +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/plannodes.h" + +static pgpa_query_feature *pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +static void pgpa_qf_add_rti(List *active_query_features, Index rti); +static void pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids); +static void pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan); + +/* + * Iterate over the entire plan tree. + * + * XXX. More comments. + */ +void +pgpa_plan_walker(pgpa_plan_walker_context *walker, Plan *plan, + int join_unroll_level, + pgpa_join_unroller *join_unroller, + List *active_query_features) +{ + pgpa_join_unroller *outer_join_unroller = NULL; + pgpa_join_unroller *inner_join_unroller = NULL; + bool join_unroller_toplevel = false; + bool pushdown_query_features = false; + ListCell *lc; + List *extraplans = NIL; + List *elided_nodes = NIL; + bool is_query_feature = false; + + /* + * If this is a Gather or Gather Merge node, directly add it to the list + * of currently-active query features. + * + * Otherwise, check the future_query_features list to see whether this was + * previously identified as a plan node that needs to be treated as a + * query feature. + */ + if (IsA(plan, Gather)) + { + active_query_features = + lappend(active_query_features, + pgpa_add_feature(walker, PGPAQF_GATHER, plan)); + is_query_feature = true; + } + else if (IsA(plan, GatherMerge)) + { + active_query_features = + lappend(active_query_features, + pgpa_add_feature(walker, PGPAQF_GATHER_MERGE, plan)); + is_query_feature = true; + } + else + { + foreach_ptr(pgpa_query_feature, qf, walker->future_query_features) + { + if (qf->plan == plan) + { + is_query_feature = true; + active_query_features = lappend(active_query_features, qf); + walker->future_query_features = + list_delete_ptr(walker->future_query_features, plan); + break; + } + } + } + + /* + * Find all elided nodes for this Plan node. + */ + foreach_node(ElidedNode, n, walker->pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_nodes = lappend(elided_nodes, n); + } + + /* If we found any elided_nodes, handle them. */ + if (elided_nodes != NIL) + { + int num_elided_nodes = list_length(elided_nodes); + ElidedNode *last_elided_node; + + /* XXX */ + last_elided_node = list_nth(elided_nodes, num_elided_nodes - 1); + pgpa_qf_add_rtis(active_query_features, last_elided_node->relids); + active_query_features = NIL; + + /* + * If there are multiple relids for the elided node, a clumped join + * should be built for it exactly once. When there's a join_unroller, + * the join unrolling process will have already built any necessary + * clumped join for the final elided node, so throw it out. + */ + if (join_unroller != NULL) + elided_nodes = list_truncate(elided_nodes, num_elided_nodes - 1); + + /* + * Every element of elided_nodes is an ElidedNode for which any + * necessary pgpa_scan has not yet been created. Do that here, and + * attach the resulting objects directly to the walker object, since + * we have nowhere else to put a reference to it. + * + * XXX. Isn't this a bald-faced lie? + */ + foreach_node(ElidedNode, n, elided_nodes) + { + pgpa_scan *scan; + + scan = pgpa_build_scan(walker->pstmt, plan, n, false); + if (scan != NULL) + walker->scans = lappend(walker->scans, scan); + } + + /* + * Because we only want to unroll joins that the planner would have + * considered as part of the same join problem, nodes elided during + * setrefs processing act as boundaries. + * + * In more detail, if an Append or MergeAppend was elided, then j a + * partitionwise join was chosen and only a single child survived; if + * a SubqueryScan was elided, the subquery was separately without + * flattening it into the parent. In either case, the join order and + * join methods beneath the elided node should be described separately + * from the join order and methods above the elided node. + */ + join_unroller = NULL; + } + + /* + * If this join needs to unrolled but there's no join unroller already + * available, create one. + */ + if (join_unroller == NULL && pgpa_is_join(plan)) + { + join_unroller = pgpa_create_join_unroller(); + join_unroller_toplevel = true; + ++join_unroll_level; + } + + /* + * We only need to handle scans here if we're not underneath a join + * unroller. When we are, the join unroller will create the pgpa_scan and + * and attach it to the unrolled join. Otherwise, do it here. + */ + if (join_unroll_level == 0) + { + pgpa_scan *scan; + + scan = pgpa_build_scan(walker->pstmt, plan, NULL, false); + if (scan != NULL) + walker->scans = lappend(walker->scans, scan); + } + + /* + * If this join is to be unrolled, pgpa_unroll_join() will return the join + * unroller object that should be passed down when we recurse into the + * outer and inner sides of the plan. + */ + if (join_unroller != NULL) + pgpa_unroll_join(walker, plan, join_unroller, + &outer_join_unroller, &inner_join_unroller); + + /* Add RTIs from the plan node to all active query features. */ + pgpa_qf_add_plan_rtis(active_query_features, plan); + + /* Recurse into the outer and inner subtrees. */ + if (plan->lefttree != NULL) + pgpa_plan_walker(walker, plan->lefttree, join_unroll_level, + outer_join_unroller, active_query_features); + if (plan->righttree != NULL) + pgpa_plan_walker(walker, plan->righttree, join_unroll_level, + inner_join_unroller, active_query_features); + + /* + * If we created a join unroller up above, then it's also our join to use + * it to build the final pgpa_unrolled_join, and to destroy the object. + */ + if (join_unroller_toplevel) + { + pgpa_unrolled_join *ujoin; + + ujoin = pgpa_build_unrolled_join(walker->pstmt, join_unroller); + walker->unrolled_joins = lappend(walker->unrolled_joins, ujoin); + pgpa_destroy_join_unroller(join_unroller); + } + + /* + * Some plan types can have additional children. Nodes like Append that + * can have any number of children store them in a List; a SubqueryScan + * just has a field for a single additional Plan. + */ + switch (nodeTag(plan)) + { + case T_Append: + { + Append *aplan = (Append *) plan; + + extraplans = aplan->appendplans; + pushdown_query_features = bms_is_empty(aplan->apprelids); + } + break; + case T_MergeAppend: + { + MergeAppend *maplan = (MergeAppend *) plan; + + extraplans = maplan->mergeplans; + pushdown_query_features = bms_is_empty(maplan->apprelids); + } + break; + case T_BitmapAnd: + extraplans = ((BitmapAnd *) plan)->bitmapplans; + break; + case T_BitmapOr: + extraplans = ((BitmapOr *) plan)->bitmapplans; + break; + case T_SubqueryScan: + + /* + * We don't pass down active_query_features across here, because + * those are specific to a subquery level. + */ + pgpa_plan_walker(walker, ((SubqueryScan *) plan)->subplan, + 0, NULL, NIL); + break; + case T_CustomScan: + extraplans = ((CustomScan *) plan)->custom_plans; + break; + default: + break; + } + + /* If we found a list of extra children, iterate over it. */ + foreach(lc, extraplans) + { + Plan *subplan = lfirst(lc); + + pgpa_plan_walker(walker, subplan, 0, NULL, + pushdown_query_features ? active_query_features : NIL); + } + + /* + * If the current node is a query feature, then active_query_features has + * been destructively modified as compared to the value passed down from + * the caller, and we need to put things back as they were. + * + * Exceptions: If the caller passed NIL, or if we reset the list to NIL + * because of the presence of an elided SubqueryScan, then we created a + * new list here instead of destructively modifying the caller's list. + * That case requires no special handling, because list_truncate() will + * simply exit quickly. + */ + if (is_query_feature) + { + int num_aqf = list_length(active_query_features); + + (void) list_truncate(active_query_features, num_aqf - 1); + } +} + +/* + * Arrange for the given plan node to be treated as a query feature when the + * tree walk reaches it. + * + * Make sure to only use this for nodes that the tree walk can't have reached + * yet! + */ +void +pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = pgpa_add_feature(walker, type, plan); + + walker->future_query_features = + lappend(walker->future_query_features, qf); +} + +/* + * Return the last of any elided nodes associated with this plan node ID. + * + * The last elided node is the one that would have been uppermost in the plan + * tree had it not been removed during setrefs processig. + */ +ElidedNode * +pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan) +{ + ElidedNode *elided_node = NULL; + + foreach_node(ElidedNode, n, pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_node = n; + } + + return elided_node; +} + +/* + * Certain plan nodes can refer to a set of RTIs. Extract and return the set. + */ +Bitmapset * +pgpa_relids(Plan *plan) +{ + if (IsA(plan, Result)) + return ((Result *) plan)->relids; + else if (IsA(plan, ForeignScan)) + return ((ForeignScan *) plan)->fs_relids; + else if (IsA(plan, Append)) + return ((Append *) plan)->apprelids; + else if (IsA(plan, MergeAppend)) + return ((MergeAppend *) plan)->apprelids; + + return NULL; +} + +/* + * Extract the scanned RTI from a plan node. + * + * Returns 0 if there isn't one. + */ +Index +pgpa_scanrelid(Plan *plan) +{ + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_ForeignScan: + case T_CustomScan: + case T_IndexScan: + case T_IndexOnlyScan: + return ((Scan *) plan)->scanrelid; + default: + return 0; + } +} + +/* + * Create a pgpa_query_feature and add it to the list of all query features + * for this plan. + */ +static pgpa_query_feature * +pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = palloc0_object(pgpa_query_feature); + + qf->type = type; + qf->plan = plan; + + walker->query_features = lappend(walker->query_features, qf); + + return qf; +} + +/* + * Add a single RTI to each active query feature. + */ +static void +pgpa_qf_add_rti(List *active_query_features, Index rti) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_member(qf->relids, rti); + } +} + +/* + * Add a set of RTIs to each active query feature. + */ +static void +pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_members(qf->relids, relids); + } +} + +/* + * Add RTIs directly contained in a plan node to each active query feature. + */ +static void +pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan) +{ + Bitmapset *relids; + Index rti; + + if ((relids = pgpa_relids(plan)) != NULL) + pgpa_qf_add_rtis(active_query_features, relids); + else if ((rti = pgpa_scanrelid(plan)) != 0) + pgpa_qf_add_rti(active_query_features, rti); +} diff --git a/contrib/pg_plan_advice/pgpa_walker.h b/contrib/pg_plan_advice/pgpa_walker.h new file mode 100644 index 0000000000..871d31801d --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * pgpa_walker.h + * Plan tree iteration + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_walker.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_WALKER_H +#define PGPA_WALKER_H + +#include "pgpa_join.h" + +/* + * We use the term "query feature" to refer to plan nodes that are interesting + * in the following way: to generate advice, we'll need to know the set of + * same-subquery, non-join RTIs occuring at or below that plan node, without + * admixture of parent and child RTIs. + * + * For example, Gather nodes, desiginated by PGPAQF_GATHER, and Gather Merge + * nodes, designated by PGPAQF_GATHER_MERGE, are query features, because we'll + * want to admit some kind of advice that describes the portion of the plan + * tree that appears beneath those nodes. + * + * Each semijoin can be implemented either by directly performing a semijoin, + * or by making one side unique and then performing a normal join. Either way, + * we use a query feature to notice what decision was made, so that we can + * describe it by enumerating the RTIs on that side of the join. + * + * To elaborate on the "no admixture of parent and child RTIs" rule, in all of + * these cases, if the entirety of an inheritance hierarchy appears beneath + * the query feature, we only want to name the parent table. But it's also + * possible to have cases where we must name child tables. This is particularly + * likely to happen when partitionwise join is in use, but could happen for + * Gather or Gather Merge even without that, if one of those appears below + * an Append or MergeAppend node for a single table. + */ +typedef enum pgpa_qf_type +{ + PGPAQF_GATHER, + PGPAQF_GATHER_MERGE, + PGPAQF_SEMIJOIN_NON_UNIQUE, + /* update NUM_PGPA_QF_TYPES if you add anything here */ + PGPAQF_SEMIJOIN_UNIQUE +} pgpa_qf_type; + +#define NUM_PGPA_QF_TYPES ((int) PGPAQF_SEMIJOIN_UNIQUE + 1) + +/* + * For each query feature, we keep track of the feature type and the set of + * relids that we found underneath the relevant plan node. See the comments + * on pgpa_qf_type, above, for additional details. + */ +typedef struct pgpa_query_feature +{ + pgpa_qf_type type; + Plan *plan; + Bitmapset *relids; +} pgpa_query_feature; + +typedef struct pgpa_plan_walker_context +{ + PlannedStmt *pstmt; + List *unrolled_joins; + List *scans; + List *query_features; + List *future_query_features; +} pgpa_plan_walker_context; + +extern void pgpa_plan_walker(pgpa_plan_walker_context *walker, Plan *plan, + int join_unroll_level, + pgpa_join_unroller *join_unroller, + List *active_query_features); + +extern void pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +extern ElidedNode *pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan); +extern Bitmapset *pgpa_relids(Plan *plan); +extern Index pgpa_scanrelid(Plan *plan); + +#endif diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index df850cf634..ad8416a4d8 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4345,3 +4345,20 @@ z_streamp zic_t SubPlanRTInfo ElidedNode +pgpa_collected_advice +pgpa_join_class +pgpa_join_member +pgpa_join_strategy +pgpa_join_unroller +pgpa_local_advice +pgpa_local_advice_chunk +pgpa_output_context +pgpa_plan_walker_context +pgpa_qf_type +pgpa_query_feature +pgpa_scan +pgpa_scan_strategy +pgpa_shared_advice +pgpa_shared_advice_chunk +pgpa_shared_state +pgpa_unrolled_join |