Allow estimate_num_groups() to pass back further details about the estimation
author David Rowley <drowley@postgresql.org>
Tue, 30 Mar 2021 07:52:46 +0000 (20:52 +1300)
committer David Rowley <drowley@postgresql.org>
Tue, 30 Mar 2021 07:52:46 +0000 (20:52 +1300)
Here we add a new output parameter to estimate_num_groups() to allow it to
inform the caller of additional, possibly useful information about the
estimation.

The new output parameter is a struct that currently contains just a single
field with a set of flags.  This was done rather than having the flags as
an output parameter to allow future fields to be added without having to
change the signature of the function at a later date when we want to pass
back further information that might not be suitable to store in the flags
field.

It seems reasonable that one day in the future the planner would want
to know more about the estimation. For example, how many individual sets
of statistics was the estimation generated from?  The planner may want to
take that into account if we ever want to consider risks as well as costs
when generating plans.

For now, there's only 1 flag we set in the flags field.  This is to
indicate if the estimation fell back on using the hard-coded constants in
any part of the estimation. Callers may like to change their behavior if
this is set, and this gives them the ability to do so.  Callers may pass
the new output parameter as NULL if they have no interest in obtaining any
additional information about the estimate.

We're not adding any actual usages of these flags here.  Some follow-up
commits will make use of this feature.  Additionally, we're also not
making any changes to add support for clauselist_selectivity() and
clauselist_selectivity_ext().  However, if this is required in the future
then the same struct being added here should be fine to use as a new
output argument for those functions too.

Author: David Rowley
Discussion: https://postgr.es/m/CAApHDvqQqpk=1W-G_ds7A9CsXX3BggWj_7okinzkLVhDubQzjA@mail.gmail.com

contrib/postgres_fdw/postgres_fdw.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/path/indxpath.c
src/backend/optimizer/plan/planner.c
src/backend/optimizer/prep/prepunion.c
src/backend/optimizer/util/pathnode.c
src/backend/utils/adt/selfuncs.c
src/include/utils/selfuncs.h

index a4babed2b04631501044b5630dfaa4808edfcced..20b25935ce68c4431a1a6704348c0ca4c0f7b3cf 100644 (file)
@@ -3087,7 +3087,7 @@ estimate_path_cost_size(PlannerInfo *root,
                        numGroups = estimate_num_groups(root,
                                                                                        get_sortgrouplist_exprs(root->parse->groupClause,
                                                                                                                                        fpinfo->grouped_tlist),
-                                                                                       input_rows, NULL);
+                                                                                       input_rows, NULL, NULL);
 
                        /*
                         * Get the retrieved_rows and rows estimates.  If there are HAVING
index a25b674a1927f02d4919acb3ab05856b7832590a..b92c94858821e9b2142b1d0c380e0b41274bb9aa 100644 (file)
@@ -1969,7 +1969,8 @@ cost_incremental_sort(Path *path,
 
        /* Estimate number of groups with equal presorted keys. */
        if (!unknown_varno)
-               input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL);
+               input_groups = estimate_num_groups(root, presortedExprs, input_tuples,
+                                                                                  NULL, NULL);
 
        group_tuples = input_tuples / input_groups;
        group_input_run_cost = input_run_cost / input_groups;
index ff536e6b24ba4a84d56d3624b5055f8af8f32282..53b24e9e8c80ea07c36b1a5d63726ffff7029e35 100644 (file)
@@ -1990,6 +1990,7 @@ adjust_rowcount_for_semijoins(PlannerInfo *root,
                        nunique = estimate_num_groups(root,
                                                                                  sjinfo->semi_rhs_exprs,
                                                                                  nraw,
+                                                                                 NULL,
                                                                                  NULL);
                        if (rowcount > nunique)
                                rowcount = nunique;
index f529d107d29c3614c6514d865e1442ab8b873a1c..0886bf4ae8f6001cd0906fd5f0c17b03ee04b090 100644 (file)
@@ -3702,7 +3702,8 @@ get_number_of_groups(PlannerInfo *root,
                                        double          numGroups = estimate_num_groups(root,
                                                                                                                                groupExprs,
                                                                                                                                path_rows,
-                                                                                                                               &gset);
+                                                                                                                               &gset,
+                                                                                                                               NULL);
 
                                        gs->numGroups = numGroups;
                                        rollup->numGroups += numGroups;
@@ -3727,7 +3728,8 @@ get_number_of_groups(PlannerInfo *root,
                                        double          numGroups = estimate_num_groups(root,
                                                                                                                                groupExprs,
                                                                                                                                path_rows,
-                                                                                                                               &gset);
+                                                                                                                               &gset,
+                                                                                                                               NULL);
 
                                        gs->numGroups = numGroups;
                                        gd->dNumHashGroups += numGroups;
@@ -3743,7 +3745,7 @@ get_number_of_groups(PlannerInfo *root,
                                                                                                 target_list);
 
                        dNumGroups = estimate_num_groups(root, groupExprs, path_rows,
-                                                                                        NULL);
+                                                                                        NULL, NULL);
                }
        }
        else if (parse->groupingSets)
@@ -4792,7 +4794,7 @@ create_distinct_paths(PlannerInfo *root,
                                                                                                parse->targetList);
                numDistinctRows = estimate_num_groups(root, distinctExprs,
                                                                                          cheapest_input_path->rows,
-                                                                                         NULL);
+                                                                                         NULL, NULL);
        }
 
        /*
index becdcbb872533123b5c12bcc8397d6d6aeb36109..037dfaacfd4241f5c205a291c91e97da24bac894 100644 (file)
@@ -338,6 +338,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
                                *pNumGroups = estimate_num_groups(subroot,
                                                                                                  get_tlist_exprs(subquery->targetList, false),
                                                                                                  subpath->rows,
+                                                                                                 NULL,
                                                                                                  NULL);
                }
        }
index 69b83071cf2190e020fcf1f5051aeb1e82b84497..d5c66780ac861a6f705568f0c4530f32560f2880 100644 (file)
@@ -1713,6 +1713,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
        pathnode->path.rows = estimate_num_groups(root,
                                                                                          sjinfo->semi_rhs_exprs,
                                                                                          rel->rows,
+                                                                                         NULL,
                                                                                          NULL);
        numCols = list_length(sjinfo->semi_rhs_exprs);
 
index 7e41bc56418ea0a2c9ac8a4a46e51d8454598461..0963e2701cb7cd061afc51e35b4ee0ca35a82fc7 100644 (file)
@@ -3241,6 +3241,7 @@ typedef struct
        Node       *var;                        /* might be an expression, not just a Var */
        RelOptInfo *rel;                        /* relation it belongs to */
        double          ndistinct;              /* # distinct values */
+       bool            isdefault;              /* true if DEFAULT_NUM_DISTINCT was used */
 } GroupVarInfo;
 
 static List *
@@ -3287,6 +3288,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
        varinfo->var = var;
        varinfo->rel = vardata->rel;
        varinfo->ndistinct = ndistinct;
+       varinfo->isdefault = isdefault;
        varinfos = lappend(varinfos, varinfo);
        return varinfos;
 }
@@ -3311,6 +3313,12 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
  *     pgset - NULL, or a List** pointing to a grouping set to filter the
  *             groupExprs against
  *
+ * Outputs:
+ *     estinfo - When passed as non-NULL, the function will set bits in the
+ *             "flags" field in order to provide callers with additional information
+ *             about the estimation.  Currently, we only set the SELFLAG_USED_DEFAULT
+ *             bit if we used any default values in the estimation.
+ *
  * Given the lack of any cross-correlation statistics in the system, it's
  * impossible to do anything really trustworthy with GROUP BY conditions
  * involving multiple Vars.  We should however avoid assuming the worst
@@ -3358,7 +3366,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
  */
 double
 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
-                                       List **pgset)
+                                       List **pgset, EstimationInfo *estinfo)
 {
        List       *varinfos = NIL;
        double          srf_multiplier = 1.0;
@@ -3366,6 +3374,10 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
        ListCell   *l;
        int                     i;
 
+       /* Zero the estinfo output parameter, if non-NULL */
+       if (estinfo != NULL)
+               memset(estinfo, 0, sizeof(EstimationInfo));
+
        /*
         * We don't ever want to return an estimate of zero groups, as that tends
         * to lead to division-by-zero and other unpleasantness.  The input_rows
@@ -3577,6 +3589,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
                                        if (relmaxndistinct < varinfo2->ndistinct)
                                                relmaxndistinct = varinfo2->ndistinct;
                                        relvarcount++;
+
+                                       /*
+                                        * When varinfo2's isdefault is set then we'd better set
+                                        * the SELFLAG_USED_DEFAULT bit in the EstimationInfo.
+                                        */
+                                       if (estinfo != NULL && varinfo2->isdefault)
+                                               estinfo->flags |= SELFLAG_USED_DEFAULT;
+
                                }
 
                                /* we're done with this relation */
index f9be539602b259b18952cb8bc9fcf3740eade3a4..78cde58acc109226d5c8f1cc149c35527e36dffb 100644 (file)
                        p = 1.0; \
        } while (0)
 
+/*
+ * A set of flags which some selectivity estimation functions can pass back to
+ * callers to provide further details about some assumptions which were made
+ * during the estimation.
+ */
+#define SELFLAG_USED_DEFAULT           (1 << 0)        /* Estimation fell back on one
+                                                                                                * of the DEFAULTs as defined
+                                                                                                * above. */
+
+typedef struct EstimationInfo
+{
+       uint32                  flags;          /* Flags, as defined above to mark special
+                                                                * properties of the estimation. */
+} EstimationInfo;
 
 /* Return data from examine_variable and friends */
 typedef struct VariableStatData
@@ -197,7 +211,8 @@ extern void mergejoinscansel(PlannerInfo *root, Node *clause,
                                                         Selectivity *rightstart, Selectivity *rightend);
 
 extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
-                                                                 double input_rows, List **pgset);
+                                                                 double input_rows, List **pgset,
+                                                                 EstimationInfo *estinfo);
 
 extern void estimate_hash_bucket_stats(PlannerInfo *root,
                                                                           Node *hashkey, double nbuckets,