diff options
Diffstat (limited to 'src/backend/statistics')
| -rw-r--r-- | src/backend/statistics/dependencies.c | 4 | ||||
| -rw-r--r-- | src/backend/statistics/extended_stats.c | 217 | ||||
| -rw-r--r-- | src/backend/statistics/mcv.c | 169 |
3 files changed, 313 insertions, 77 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index d950b4eabe0..b1abcde9687 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -1073,8 +1073,8 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, } } - simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid, - jointype, sjinfo, NULL); + simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid, + jointype, sjinfo, false); attr_sel[attidx++] = simple_sel; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 36326927c6b..8d3cd091ada 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1239,10 +1239,10 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, * One of the main challenges with using MCV lists is how to extrapolate the * estimate to the data not covered by the MCV list. To do that, we compute * not only the "MCV selectivity" (selectivities for MCV items matching the - * supplied clauses), but also a couple of derived selectivities: + * supplied clauses), but also the following related selectivities: * - * - simple selectivity: Computed without extended statistic, i.e. as if the - * columns/clauses were independent + * - simple selectivity: Computed without extended statistics, i.e. as if the + * columns/clauses were independent. * * - base selectivity: Similar to simple selectivity, but is computed using * the extended statistic by adding up the base frequencies (that we compute @@ -1250,30 +1250,9 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, * * - total selectivity: Selectivity covered by the whole MCV list. * - * - other selectivity: A selectivity estimate for data not covered by the MCV - * list (i.e. satisfying the clauses, but not common enough to make it into - * the MCV list) - * - * Note: While simple and base selectivities are defined in a quite similar - * way, the values are computed differently and are not therefore equal. The - * simple selectivity is computed as a product of per-clause estimates, while - * the base selectivity is computed by adding up base frequencies of matching - * items of the multi-column MCV list. So the values may differ for two main - * reasons - (a) the MCV list may not cover 100% of the data and (b) some of - * the MCV items did not match the estimated clauses. - * - * As both (a) and (b) reduce the base selectivity value, it generally holds - * that (simple_selectivity >= base_selectivity). If the MCV list covers all - * the data, the values may be equal. - * - * So, (simple_selectivity - base_selectivity) is an estimate for the part - * not covered by the MCV list, and (mcv_selectivity - base_selectivity) may - * be seen as a correction for the part covered by the MCV list. Those two - * statements are actually equivalent. - * - * Note: Due to rounding errors and minor differences in how the estimates - * are computed, the inequality may not always hold. Which is why we clamp - * the selectivities to prevent strange estimate (negative etc.). + * These are passed to mcv_combine_selectivities() which combines them to + * produce a selectivity estimate that makes use of both per-column statistics + * and the multi-column MCV statistics. * * 'estimatedclauses' is an input/output parameter. We set bits for the * 0-based 'clauses' indexes we estimate for and also skip clause items that @@ -1282,16 +1261,17 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { ListCell *l; Bitmapset **list_attnums; int listidx; - Selectivity sel = 1.0; + Selectivity sel = (is_or) ? 0.0 : 1.0; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) - return 1.0; + return sel; list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); @@ -1327,12 +1307,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli { StatisticExtInfo *stat; List *stat_clauses; - Selectivity simple_sel, - mcv_sel, - mcv_basesel, - mcv_totalsel, - other_sel, - stat_sel; + Bitmapset *simple_clauses; /* find the best suited statistics object for these attnums */ stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, @@ -1351,6 +1326,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* now filter the clauses to be estimated using the selected MCV */ stat_clauses = NIL; + /* record which clauses are simple (single column) */ + simple_clauses = NULL; + listidx = 0; foreach(l, clauses) { @@ -1361,6 +1339,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli if (list_attnums[listidx] != NULL && bms_is_subset(list_attnums[listidx], stat->keys)) { + if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON) + simple_clauses = bms_add_member(simple_clauses, + list_length(stat_clauses)); + stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); *estimatedclauses = bms_add_member(*estimatedclauses, listidx); @@ -1371,40 +1353,131 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli listidx++; } - /* - * First compute "simple" selectivity, i.e. without the extended - * statistics, and essentially assuming independence of the - * columns/clauses. We'll then use the various selectivities computed - * from MCV list to improve it. - */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); - - /* - * Now compute the multi-column estimate from the MCV list, along with - * the other selectivities (base & total selectivity). - */ - mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, - jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); + if (is_or) + { + bool *or_matches = NULL; + Selectivity simple_or_sel = 0.0; + MCVList *mcv_list; - /* Estimated selectivity of values not covered by MCV matches */ - other_sel = simple_sel - mcv_basesel; - CLAMP_PROBABILITY(other_sel); + /* Load the MCV list stored in the statistics object */ + mcv_list = statext_mcv_load(stat->statOid); - /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ - if (other_sel > 1.0 - mcv_totalsel) - other_sel = 1.0 - mcv_totalsel; + /* + * Compute the selectivity of the ORed list of clauses by + * estimating each in turn and combining them using the formula + * P(A OR B) = P(A) + P(B) - P(A AND B). This allows us to use + * the multivariate MCV stats to better estimate each term. + * + * Each time we iterate this formula, the clause "A" above is + * equal to all the clauses processed so far, combined with "OR". + */ + listidx = 0; + foreach(l, stat_clauses) + { + Node *clause = (Node *) lfirst(l); + Selectivity simple_sel, + overlap_simple_sel, + mcv_sel, + mcv_basesel, + overlap_mcvsel, + overlap_basesel, + mcv_totalsel, + clause_sel, + overlap_sel; + + /* + * "Simple" selectivity of the next clause and its overlap + * with any of the previous clauses. These are our initial + * estimates of P(B) and P(A AND B), assuming independence of + * columns/clauses. + */ + simple_sel = clause_selectivity_ext(root, clause, varRelid, + jointype, sjinfo, false); + + overlap_simple_sel = simple_or_sel * simple_sel; + + /* + * New "simple" selectivity of all clauses seen so far, + * assuming independence. + */ + simple_or_sel += simple_sel - overlap_simple_sel; + CLAMP_PROBABILITY(simple_or_sel); + + /* + * Multi-column estimate of this clause using MCV statistics, + * along with base and total selectivities, and corresponding + * selectivities for the overlap term P(A AND B). + */ + mcv_sel = mcv_clause_selectivity_or(root, stat, mcv_list, + clause, &or_matches, + &mcv_basesel, + &overlap_mcvsel, + &overlap_basesel, + &mcv_totalsel); + + /* + * Combine the simple and multi-column estimates. + * + * If this clause is a simple single-column clause, then we + * just use the simple selectivity estimate for it, since the + * multi-column statistics are unlikely to improve on that + * (and in fact could make it worse). For the overlap, we + * always make use of the multi-column statistics. + */ + if (bms_is_member(listidx, simple_clauses)) + clause_sel = simple_sel; + else + clause_sel = mcv_combine_selectivities(simple_sel, + mcv_sel, + mcv_basesel, + mcv_totalsel); + + overlap_sel = mcv_combine_selectivities(overlap_simple_sel, + overlap_mcvsel, + overlap_basesel, + mcv_totalsel); + + /* Factor these into the overall result */ + sel += clause_sel - overlap_sel; + CLAMP_PROBABILITY(sel); + + listidx++; + } + } + else /* Implicitly-ANDed list of clauses */ + { + Selectivity simple_sel, + mcv_sel, + mcv_basesel, + mcv_totalsel, + stat_sel; - /* - * Overall selectivity is the combination of MCV and non-MCV - * estimates. - */ - stat_sel = mcv_sel + other_sel; - CLAMP_PROBABILITY(stat_sel); + /* + * "Simple" selectivity, i.e. without any extended statistics, + * essentially assuming independence of the columns/clauses. + */ + simple_sel = clauselist_selectivity_ext(root, stat_clauses, + varRelid, jointype, + sjinfo, false); - /* Factor the estimate from this MCV to the overall estimate. */ - sel *= stat_sel; + /* + * Multi-column estimate using MCV statistics, along with base and + * total selectivities. + */ + mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, + varRelid, jointype, sjinfo, + rel, &mcv_basesel, + &mcv_totalsel); + + /* Combine the simple and multi-column estimates. */ + stat_sel = mcv_combine_selectivities(simple_sel, + mcv_sel, + mcv_basesel, + mcv_totalsel); + + /* Factor this into the overall result */ + sel *= stat_sel; + } } return sel; @@ -1417,13 +1490,21 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli Selectivity statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { Selectivity sel; /* First, try estimating clauses using a multivariate MCV list. */ sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype, - sjinfo, rel, estimatedclauses); + sjinfo, rel, estimatedclauses, is_or); + + /* + * Functional dependencies only work for clauses connected by AND, so for + * OR clauses we're done. + */ + if (is_or) + return sel; /* * Then, apply functional dependencies on the remaining clauses by calling diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 6a262f15436..fae792a2ddf 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -32,6 +32,7 @@ #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -1889,15 +1890,79 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, /* + * mcv_combine_selectivities + * Combine per-column and multi-column MCV selectivity estimates. + * + * simple_sel is a "simple" selectivity estimate (produced without using any + * extended statistics, essentially assuming independence of columns/clauses). + * + * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of + * all matching MCV items. The difference (mcv_sel - mcv_basesel) is then + * essentially interpreted as a correction to be added to simple_sel, as + * described below. + * + * mcv_totalsel is the sum of the frequencies of all MCV items (not just the + * matching ones). This is used as an upper bound on the portion of the + * selectivity estimates not covered by the MCV statistics. + * + * Note: While simple and base selectivities are defined in a quite similar + * way, the values are computed differently and are not therefore equal. The + * simple selectivity is computed as a product of per-clause estimates, while + * the base selectivity is computed by adding up base frequencies of matching + * items of the multi-column MCV list. So the values may differ for two main + * reasons - (a) the MCV list may not cover 100% of the data and (b) some of + * the MCV items did not match the estimated clauses. + * + * As both (a) and (b) reduce the base selectivity value, it generally holds + * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the + * values may be equal. + * + * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not + * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a + * correction for the part covered by the MCV list. Those two statements are + * actually equivalent. + */ +Selectivity +mcv_combine_selectivities(Selectivity simple_sel, + Selectivity mcv_sel, + Selectivity mcv_basesel, + Selectivity mcv_totalsel) +{ + Selectivity other_sel; + Selectivity sel; + + /* estimated selectivity of values not covered by MCV matches */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); + + /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; + + /* overall selectivity is the sum of the MCV and non-MCV parts */ + sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(sel); + + return sel; +} + + +/* * mcv_clauselist_selectivity - * Return the selectivity estimate computed using an MCV list. + * Use MCV statistics to estimate the selectivity of an implicitly-ANDed + * list of clauses. * - * First builds a bitmap of MCV items matching the clauses, and then sums - * the frequencies of matching items. + * This determines which MCV items match every clause in the list and returns + * the sum of the frequencies of those items. * - * It also produces two additional interesting selectivities - total - * selectivity of all the MCV items (not just the matching ones), and the - * base frequency computed on the assumption of independence. + * In addition, it returns the sum of the base frequencies of each of those + * items (that is the sum of the selectivities that each item would have if + * the columns were independent of one another), and the total selectivity of + * all the MCV items (not just the matching ones). These are expected to be + * used together with a "simple" selectivity estimate (one based only on + * per-column statistics) to produce an overall selectivity estimate that + * makes use of both per-column and multi-column statistics --- see + * mcv_combine_selectivities(). */ Selectivity mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, @@ -1928,7 +1993,6 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, if (matches[i] != false) { - /* XXX Shouldn't the basesel be outside the if condition? */ *basesel += mcv->items[i].base_frequency; s += mcv->items[i].frequency; } @@ -1936,3 +2000,94 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, return s; } + + +/* + * mcv_clause_selectivity_or + * Use MCV statistics to estimate the selectivity of a clause that + * appears in an ORed list of clauses. + * + * As with mcv_clauselist_selectivity() this determines which MCV items match + * the clause and returns both the sum of the frequencies and the sum of the + * base frequencies of those items, as well as the sum of the frequencies of + * all MCV items (not just the matching ones) so that this information can be + * used by mcv_combine_selectivities() to produce a selectivity estimate that + * makes use of both per-column and multi-column statistics. + * + * Additionally, we return information to help compute the overall selectivity + * of the ORed list of clauses assumed to contain this clause. This function + * is intended to be called for each clause in the ORed list of clauses, + * allowing the overall selectivity to be computed using the following + * algorithm: + * + * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity + * of the first n clauses in the list. Then the combined selectivity taking + * into account the next clause C[n+1] can be written as + * + * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1]) + * + * The final term above represents the overlap between the clauses examined so + * far and the (n+1)'th clause. To estimate its selectivity, we track the + * match bitmap for the ORed list of clauses examined so far and examine its + * intersection with the match bitmap for the (n+1)'th clause. + * + * We then also return the sums of the MCV item frequencies and base + * frequencies for the match bitmap intersection corresponding to the overlap + * term above, so that they can be combined with a simple selectivity estimate + * for that term. + * + * The parameter "or_matches" is an in/out parameter tracking the match bitmap + * for the clauses examined so far. The caller is expected to set it to NULL + * the first time it calls this function. + */ +Selectivity +mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, + MCVList *mcv, Node *clause, bool **or_matches, + Selectivity *basesel, Selectivity *overlap_mcvsel, + Selectivity *overlap_basesel, Selectivity *totalsel) +{ + Selectivity s = 0.0; + bool *new_matches; + int i; + + /* build the OR-matches bitmap, if not built already */ + if (*or_matches == NULL) + *or_matches = palloc0(sizeof(bool) * mcv->nitems); + + /* build the match bitmap for the new clause */ + new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, + mcv, false); + + /* + * Sum the frequencies for all the MCV items matching this clause and also + * those matching the overlap between this clause and any of the preceding + * clauses as described above. + */ + *basesel = 0.0; + *overlap_mcvsel = 0.0; + *overlap_basesel = 0.0; + *totalsel = 0.0; + for (i = 0; i < mcv->nitems; i++) + { + *totalsel += mcv->items[i].frequency; + + if (new_matches[i]) + { + s += mcv->items[i].frequency; + *basesel += mcv->items[i].base_frequency; + + if ((*or_matches)[i]) + { + *overlap_mcvsel += mcv->items[i].frequency; + *overlap_basesel += mcv->items[i].base_frequency; + } + } + + /* update the OR-matches bitmap for the next clause */ + (*or_matches)[i] = (*or_matches)[i] || new_matches[i]; + } + + pfree(new_matches); + + return s; +} |
