summaryrefslogtreecommitdiff
path: root/src/backend/statistics
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/statistics')
-rw-r--r--src/backend/statistics/dependencies.c4
-rw-r--r--src/backend/statistics/extended_stats.c217
-rw-r--r--src/backend/statistics/mcv.c169
3 files changed, 313 insertions, 77 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index d950b4eabe0..b1abcde9687 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -1073,8 +1073,8 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
}
}
- simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid,
- jointype, sjinfo, NULL);
+ simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
+ jointype, sjinfo, false);
attr_sel[attidx++] = simple_sel;
}
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 36326927c6b..8d3cd091ada 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1239,10 +1239,10 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
* One of the main challenges with using MCV lists is how to extrapolate the
* estimate to the data not covered by the MCV list. To do that, we compute
* not only the "MCV selectivity" (selectivities for MCV items matching the
- * supplied clauses), but also a couple of derived selectivities:
+ * supplied clauses), but also the following related selectivities:
*
- * - simple selectivity: Computed without extended statistic, i.e. as if the
- * columns/clauses were independent
+ * - simple selectivity: Computed without extended statistics, i.e. as if the
+ * columns/clauses were independent.
*
* - base selectivity: Similar to simple selectivity, but is computed using
* the extended statistic by adding up the base frequencies (that we compute
@@ -1250,30 +1250,9 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
*
* - total selectivity: Selectivity covered by the whole MCV list.
*
- * - other selectivity: A selectivity estimate for data not covered by the MCV
- * list (i.e. satisfying the clauses, but not common enough to make it into
- * the MCV list)
- *
- * Note: While simple and base selectivities are defined in a quite similar
- * way, the values are computed differently and are not therefore equal. The
- * simple selectivity is computed as a product of per-clause estimates, while
- * the base selectivity is computed by adding up base frequencies of matching
- * items of the multi-column MCV list. So the values may differ for two main
- * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
- * the MCV items did not match the estimated clauses.
- *
- * As both (a) and (b) reduce the base selectivity value, it generally holds
- * that (simple_selectivity >= base_selectivity). If the MCV list covers all
- * the data, the values may be equal.
- *
- * So, (simple_selectivity - base_selectivity) is an estimate for the part
- * not covered by the MCV list, and (mcv_selectivity - base_selectivity) may
- * be seen as a correction for the part covered by the MCV list. Those two
- * statements are actually equivalent.
- *
- * Note: Due to rounding errors and minor differences in how the estimates
- * are computed, the inequality may not always hold. Which is why we clamp
- * the selectivities to prevent strange estimate (negative etc.).
+ * These are passed to mcv_combine_selectivities() which combines them to
+ * produce a selectivity estimate that makes use of both per-column statistics
+ * and the multi-column MCV statistics.
*
* 'estimatedclauses' is an input/output parameter. We set bits for the
* 0-based 'clauses' indexes we estimate for and also skip clause items that
@@ -1282,16 +1261,17 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
static Selectivity
statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
- RelOptInfo *rel, Bitmapset **estimatedclauses)
+ RelOptInfo *rel, Bitmapset **estimatedclauses,
+ bool is_or)
{
ListCell *l;
Bitmapset **list_attnums;
int listidx;
- Selectivity sel = 1.0;
+ Selectivity sel = (is_or) ? 0.0 : 1.0;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
- return 1.0;
+ return sel;
list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
list_length(clauses));
@@ -1327,12 +1307,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
{
StatisticExtInfo *stat;
List *stat_clauses;
- Selectivity simple_sel,
- mcv_sel,
- mcv_basesel,
- mcv_totalsel,
- other_sel,
- stat_sel;
+ Bitmapset *simple_clauses;
/* find the best suited statistics object for these attnums */
stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
@@ -1351,6 +1326,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
/* now filter the clauses to be estimated using the selected MCV */
stat_clauses = NIL;
+ /* record which clauses are simple (single column) */
+ simple_clauses = NULL;
+
listidx = 0;
foreach(l, clauses)
{
@@ -1361,6 +1339,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
if (list_attnums[listidx] != NULL &&
bms_is_subset(list_attnums[listidx], stat->keys))
{
+ if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON)
+ simple_clauses = bms_add_member(simple_clauses,
+ list_length(stat_clauses));
+
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
@@ -1371,40 +1353,131 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
listidx++;
}
- /*
- * First compute "simple" selectivity, i.e. without the extended
- * statistics, and essentially assuming independence of the
- * columns/clauses. We'll then use the various selectivities computed
- * from MCV list to improve it.
- */
- simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
- jointype, sjinfo, NULL);
-
- /*
- * Now compute the multi-column estimate from the MCV list, along with
- * the other selectivities (base & total selectivity).
- */
- mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
- jointype, sjinfo, rel,
- &mcv_basesel, &mcv_totalsel);
+ if (is_or)
+ {
+ bool *or_matches = NULL;
+ Selectivity simple_or_sel = 0.0;
+ MCVList *mcv_list;
- /* Estimated selectivity of values not covered by MCV matches */
- other_sel = simple_sel - mcv_basesel;
- CLAMP_PROBABILITY(other_sel);
+ /* Load the MCV list stored in the statistics object */
+ mcv_list = statext_mcv_load(stat->statOid);
- /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
- if (other_sel > 1.0 - mcv_totalsel)
- other_sel = 1.0 - mcv_totalsel;
+ /*
+ * Compute the selectivity of the ORed list of clauses by
+ * estimating each in turn and combining them using the formula
+ * P(A OR B) = P(A) + P(B) - P(A AND B). This allows us to use
+ * the multivariate MCV stats to better estimate each term.
+ *
+ * Each time we iterate this formula, the clause "A" above is
+ * equal to all the clauses processed so far, combined with "OR".
+ */
+ listidx = 0;
+ foreach(l, stat_clauses)
+ {
+ Node *clause = (Node *) lfirst(l);
+ Selectivity simple_sel,
+ overlap_simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ overlap_mcvsel,
+ overlap_basesel,
+ mcv_totalsel,
+ clause_sel,
+ overlap_sel;
+
+ /*
+ * "Simple" selectivity of the next clause and its overlap
+ * with any of the previous clauses. These are our initial
+ * estimates of P(B) and P(A AND B), assuming independence of
+ * columns/clauses.
+ */
+ simple_sel = clause_selectivity_ext(root, clause, varRelid,
+ jointype, sjinfo, false);
+
+ overlap_simple_sel = simple_or_sel * simple_sel;
+
+ /*
+ * New "simple" selectivity of all clauses seen so far,
+ * assuming independence.
+ */
+ simple_or_sel += simple_sel - overlap_simple_sel;
+ CLAMP_PROBABILITY(simple_or_sel);
+
+ /*
+ * Multi-column estimate of this clause using MCV statistics,
+ * along with base and total selectivities, and corresponding
+ * selectivities for the overlap term P(A AND B).
+ */
+ mcv_sel = mcv_clause_selectivity_or(root, stat, mcv_list,
+ clause, &or_matches,
+ &mcv_basesel,
+ &overlap_mcvsel,
+ &overlap_basesel,
+ &mcv_totalsel);
+
+ /*
+ * Combine the simple and multi-column estimates.
+ *
+ * If this clause is a simple single-column clause, then we
+ * just use the simple selectivity estimate for it, since the
+ * multi-column statistics are unlikely to improve on that
+ * (and in fact could make it worse). For the overlap, we
+ * always make use of the multi-column statistics.
+ */
+ if (bms_is_member(listidx, simple_clauses))
+ clause_sel = simple_sel;
+ else
+ clause_sel = mcv_combine_selectivities(simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel);
+
+ overlap_sel = mcv_combine_selectivities(overlap_simple_sel,
+ overlap_mcvsel,
+ overlap_basesel,
+ mcv_totalsel);
+
+ /* Factor these into the overall result */
+ sel += clause_sel - overlap_sel;
+ CLAMP_PROBABILITY(sel);
+
+ listidx++;
+ }
+ }
+ else /* Implicitly-ANDed list of clauses */
+ {
+ Selectivity simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel,
+ stat_sel;
- /*
- * Overall selectivity is the combination of MCV and non-MCV
- * estimates.
- */
- stat_sel = mcv_sel + other_sel;
- CLAMP_PROBABILITY(stat_sel);
+ /*
+ * "Simple" selectivity, i.e. without any extended statistics,
+ * essentially assuming independence of the columns/clauses.
+ */
+ simple_sel = clauselist_selectivity_ext(root, stat_clauses,
+ varRelid, jointype,
+ sjinfo, false);
- /* Factor the estimate from this MCV to the overall estimate. */
- sel *= stat_sel;
+ /*
+ * Multi-column estimate using MCV statistics, along with base and
+ * total selectivities.
+ */
+ mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses,
+ varRelid, jointype, sjinfo,
+ rel, &mcv_basesel,
+ &mcv_totalsel);
+
+ /* Combine the simple and multi-column estimates. */
+ stat_sel = mcv_combine_selectivities(simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel);
+
+ /* Factor this into the overall result */
+ sel *= stat_sel;
+ }
}
return sel;
@@ -1417,13 +1490,21 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
Selectivity
statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
- RelOptInfo *rel, Bitmapset **estimatedclauses)
+ RelOptInfo *rel, Bitmapset **estimatedclauses,
+ bool is_or)
{
Selectivity sel;
/* First, try estimating clauses using a multivariate MCV list. */
sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype,
- sjinfo, rel, estimatedclauses);
+ sjinfo, rel, estimatedclauses, is_or);
+
+ /*
+ * Functional dependencies only work for clauses connected by AND, so for
+ * OR clauses we're done.
+ */
+ if (is_or)
+ return sel;
/*
* Then, apply functional dependencies on the remaining clauses by calling
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 6a262f15436..fae792a2ddf 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -32,6 +32,7 @@
#include "utils/fmgroids.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
#include "utils/syscache.h"
#include "utils/typcache.h"
@@ -1889,15 +1890,79 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
/*
+ * mcv_combine_selectivities
+ * Combine per-column and multi-column MCV selectivity estimates.
+ *
+ * simple_sel is a "simple" selectivity estimate (produced without using any
+ * extended statistics, essentially assuming independence of columns/clauses).
+ *
+ * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of
+ * all matching MCV items. The difference (mcv_sel - mcv_basesel) is then
+ * essentially interpreted as a correction to be added to simple_sel, as
+ * described below.
+ *
+ * mcv_totalsel is the sum of the frequencies of all MCV items (not just the
+ * matching ones). This is used as an upper bound on the portion of the
+ * selectivity estimates not covered by the MCV statistics.
+ *
+ * Note: While simple and base selectivities are defined in a quite similar
+ * way, the values are computed differently and are not therefore equal. The
+ * simple selectivity is computed as a product of per-clause estimates, while
+ * the base selectivity is computed by adding up base frequencies of matching
+ * items of the multi-column MCV list. So the values may differ for two main
+ * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
+ * the MCV items did not match the estimated clauses.
+ *
+ * As both (a) and (b) reduce the base selectivity value, it generally holds
+ * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the
+ * values may be equal.
+ *
+ * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not
+ * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a
+ * correction for the part covered by the MCV list. Those two statements are
+ * actually equivalent.
+ */
+Selectivity
+mcv_combine_selectivities(Selectivity simple_sel,
+ Selectivity mcv_sel,
+ Selectivity mcv_basesel,
+ Selectivity mcv_totalsel)
+{
+ Selectivity other_sel;
+ Selectivity sel;
+
+ /* estimated selectivity of values not covered by MCV matches */
+ other_sel = simple_sel - mcv_basesel;
+ CLAMP_PROBABILITY(other_sel);
+
+ /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */
+ if (other_sel > 1.0 - mcv_totalsel)
+ other_sel = 1.0 - mcv_totalsel;
+
+ /* overall selectivity is the sum of the MCV and non-MCV parts */
+ sel = mcv_sel + other_sel;
+ CLAMP_PROBABILITY(sel);
+
+ return sel;
+}
+
+
+/*
* mcv_clauselist_selectivity
- * Return the selectivity estimate computed using an MCV list.
+ * Use MCV statistics to estimate the selectivity of an implicitly-ANDed
+ * list of clauses.
*
- * First builds a bitmap of MCV items matching the clauses, and then sums
- * the frequencies of matching items.
+ * This determines which MCV items match every clause in the list and returns
+ * the sum of the frequencies of those items.
*
- * It also produces two additional interesting selectivities - total
- * selectivity of all the MCV items (not just the matching ones), and the
- * base frequency computed on the assumption of independence.
+ * In addition, it returns the sum of the base frequencies of each of those
+ * items (that is the sum of the selectivities that each item would have if
+ * the columns were independent of one another), and the total selectivity of
+ * all the MCV items (not just the matching ones). These are expected to be
+ * used together with a "simple" selectivity estimate (one based only on
+ * per-column statistics) to produce an overall selectivity estimate that
+ * makes use of both per-column and multi-column statistics --- see
+ * mcv_combine_selectivities().
*/
Selectivity
mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
@@ -1928,7 +1993,6 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
if (matches[i] != false)
{
- /* XXX Shouldn't the basesel be outside the if condition? */
*basesel += mcv->items[i].base_frequency;
s += mcv->items[i].frequency;
}
@@ -1936,3 +2000,94 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
return s;
}
+
+
+/*
+ * mcv_clause_selectivity_or
+ * Use MCV statistics to estimate the selectivity of a clause that
+ * appears in an ORed list of clauses.
+ *
+ * As with mcv_clauselist_selectivity() this determines which MCV items match
+ * the clause and returns both the sum of the frequencies and the sum of the
+ * base frequencies of those items, as well as the sum of the frequencies of
+ * all MCV items (not just the matching ones) so that this information can be
+ * used by mcv_combine_selectivities() to produce a selectivity estimate that
+ * makes use of both per-column and multi-column statistics.
+ *
+ * Additionally, we return information to help compute the overall selectivity
+ * of the ORed list of clauses assumed to contain this clause. This function
+ * is intended to be called for each clause in the ORed list of clauses,
+ * allowing the overall selectivity to be computed using the following
+ * algorithm:
+ *
+ * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity
+ * of the first n clauses in the list. Then the combined selectivity taking
+ * into account the next clause C[n+1] can be written as
+ *
+ * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1])
+ *
+ * The final term above represents the overlap between the clauses examined so
+ * far and the (n+1)'th clause. To estimate its selectivity, we track the
+ * match bitmap for the ORed list of clauses examined so far and examine its
+ * intersection with the match bitmap for the (n+1)'th clause.
+ *
+ * We then also return the sums of the MCV item frequencies and base
+ * frequencies for the match bitmap intersection corresponding to the overlap
+ * term above, so that they can be combined with a simple selectivity estimate
+ * for that term.
+ *
+ * The parameter "or_matches" is an in/out parameter tracking the match bitmap
+ * for the clauses examined so far. The caller is expected to set it to NULL
+ * the first time it calls this function.
+ */
+Selectivity
+mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
+ MCVList *mcv, Node *clause, bool **or_matches,
+ Selectivity *basesel, Selectivity *overlap_mcvsel,
+ Selectivity *overlap_basesel, Selectivity *totalsel)
+{
+ Selectivity s = 0.0;
+ bool *new_matches;
+ int i;
+
+ /* build the OR-matches bitmap, if not built already */
+ if (*or_matches == NULL)
+ *or_matches = palloc0(sizeof(bool) * mcv->nitems);
+
+ /* build the match bitmap for the new clause */
+ new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
+ mcv, false);
+
+ /*
+ * Sum the frequencies for all the MCV items matching this clause and also
+ * those matching the overlap between this clause and any of the preceding
+ * clauses as described above.
+ */
+ *basesel = 0.0;
+ *overlap_mcvsel = 0.0;
+ *overlap_basesel = 0.0;
+ *totalsel = 0.0;
+ for (i = 0; i < mcv->nitems; i++)
+ {
+ *totalsel += mcv->items[i].frequency;
+
+ if (new_matches[i])
+ {
+ s += mcv->items[i].frequency;
+ *basesel += mcv->items[i].base_frequency;
+
+ if ((*or_matches)[i])
+ {
+ *overlap_mcvsel += mcv->items[i].frequency;
+ *overlap_basesel += mcv->items[i].base_frequency;
+ }
+ }
+
+ /* update the OR-matches bitmap for the next clause */
+ (*or_matches)[i] = (*or_matches)[i] || new_matches[i];
+ }
+
+ pfree(new_matches);
+
+ return s;
+}