diff options
| author | Tomas Vondra | 2021-03-26 22:22:01 +0000 |
|---|---|---|
| committer | Tomas Vondra | 2021-03-26 23:01:11 +0000 |
| commit | a4d75c86bf15220df22de0a92c819ecef9db3849 (patch) | |
| tree | a736a68b1c3f022590a886b7bac45276f1f490a6 /src/backend/statistics | |
| parent | 98376c18f12e562421b5c77e619248e8b7aae3c6 (diff) | |
Extended statistics on expressions
Allow defining extended statistics on expressions, not just just on
simple column references. With this commit, expressions are supported
by all existing extended statistics kinds, improving the same types of
estimates. A simple example may look like this:
CREATE TABLE t (a int);
CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t;
ANALYZE t;
The collected statistics are useful e.g. to estimate queries with those
expressions in WHERE or GROUP BY clauses:
SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0;
SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20);
This introduces new internal statistics kind 'e' (expressions) which is
built automatically when the statistics object definition includes any
expressions. This represents single-expression statistics, as if there
was an expression index (but without the index maintenance overhead).
The statistics is stored in pg_statistics_ext_data as an array of
composite types, which is possible thanks to 79f6a942bd.
CREATE STATISTICS allows building statistics on a single expression, in
which case in which case it's not possible to specify statistics kinds.
A new system view pg_stats_ext_exprs can be used to display expression
statistics, similarly to pg_stats and pg_stats_ext views.
ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it
treats indexes, i.e. it drops and recreates the statistics. This means
all statistics are reset, and we no longer try to preserve at least the
functional dependencies. This should not be a major issue in practice,
as the functional dependencies actually rely on per-column statistics,
which were always reset anyway.
Author: Tomas Vondra
Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu
Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
Diffstat (limited to 'src/backend/statistics')
| -rw-r--r-- | src/backend/statistics/dependencies.c | 616 | ||||
| -rw-r--r-- | src/backend/statistics/extended_stats.c | 1253 | ||||
| -rw-r--r-- | src/backend/statistics/mcv.c | 369 | ||||
| -rw-r--r-- | src/backend/statistics/mvdistinct.c | 96 |
4 files changed, 1929 insertions, 405 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index eac9285165..cf8a6d5f68 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state); static DependencyGenerator DependencyGenerator_init(int n, int k); static void DependencyGenerator_free(DependencyGenerator state); static AttrNumber *DependencyGenerator_next(DependencyGenerator state); -static double dependency_degree(int numrows, HeapTuple *rows, int k, - AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); +static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum); +static bool dependency_is_compatible_expression(Node *clause, Index relid, + List *statlist, Node **expr); static MVDependency *find_strongest_dependency(MVDependencies **dependencies, - int ndependencies, - Bitmapset *attnums); + int ndependencies, Bitmapset *attnums); static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, @@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state) * the last one. */ static double -dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, - VacAttrStats **stats, Bitmapset *attrs) +dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency) { int i, nitems; MultiSortSupport mss; SortItem *items; - AttrNumber *attnums; AttrNumber *attnums_dep; - int numattrs; /* counters valid within a group */ int group_size = 0; @@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, mss = multi_sort_init(k); /* - * Transform the attrs from bitmap to an array to make accessing the i-th - * member easier, and then construct a filtered version with only attnums - * referenced by the dependency we validate. + * Translate the array of indexes to regular attnums for the dependency (we + * will need this to identify the columns in StatsBuildData). */ - attnums = build_attnums_array(attrs, &numattrs); - attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber)); for (i = 0; i < k; i++) - attnums_dep[i] = attnums[dependency[i]]; + attnums_dep[i] = data->attnums[dependency[i]]; /* * Verify the dependency (a,b,...)->z, using a rather simple algorithm: @@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, /* prepare the sort function for the dimensions */ for (i = 0; i < k; i++) { - VacAttrStats *colstat = stats[dependency[i]]; + VacAttrStats *colstat = data->stats[dependency[i]]; TypeCacheEntry *type; type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); @@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * descriptor. For now that assumption holds, but it might change in the * future for example if we support statistics on multiple tables. */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, - mss, k, attnums_dep); + items = build_sorted_items(data, &nitems, mss, k, attnums_dep); /* * Walk through the sorted array, split it into rows according to the @@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, pfree(items); pfree(mss); - pfree(attnums); pfree(attnums_dep); /* Compute the 'degree of validity' as (supporting/total). */ - return (n_supporting_rows * 1.0 / numrows); + return (n_supporting_rows * 1.0 / data->numrows); } /* @@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * (c) -> b */ MVDependencies * -statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, - VacAttrStats **stats) +statext_dependencies_build(StatsBuildData *data) { int i, k; - int numattrs; - AttrNumber *attnums; /* result */ MVDependencies *dependencies = NULL; - /* - * Transform the bms into an array, to make accessing i-th member easier. - */ - attnums = build_attnums_array(attrs, &numattrs); - - Assert(numattrs >= 2); + Assert(data->nattnums >= 2); /* * We'll try build functional dependencies starting from the smallest ones @@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, * included in the statistics object. We start from the smallest ones * because we want to be able to skip already implied ones. */ - for (k = 2; k <= numattrs; k++) + for (k = 2; k <= data->nattnums; k++) { AttrNumber *dependency; /* array with k elements */ /* prepare a DependencyGenerator of variation */ - DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k); + DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k); /* generate all possible variations of k values (out of n) */ while ((dependency = DependencyGenerator_next(DependencyGenerator))) @@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, MVDependency *d; /* compute how valid the dependency seems */ - degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); + degree = dependency_degree(data, k, dependency); /* * if the dependency seems entirely invalid, don't store it @@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, d->degree = degree; d->nattributes = k; for (i = 0; i < k; i++) - d->attributes[i] = attnums[dependency[i]]; + d->attributes[i] = data->attnums[dependency[i]]; /* initialize the list of dependencies */ if (dependencies == NULL) @@ -747,6 +731,7 @@ static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) { Var *var; + Node *clause_expr; if (IsA(clause, RestrictInfo)) { @@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) /* Make sure non-selected argument is a pseudoconstant. */ if (is_pseudo_constant_clause(lsecond(expr->args))) - var = linitial(expr->args); + clause_expr = linitial(expr->args); else if (is_pseudo_constant_clause(linitial(expr->args))) - var = lsecond(expr->args); + clause_expr = lsecond(expr->args); else return false; @@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) /* * Reject ALL() variant, we only care about ANY/IN. * - * FIXME Maybe we should check if all the values are the same, and - * allow ALL in that case? Doesn't seem very practical, though. + * XXX Maybe we should check if all the values are the same, and allow + * ALL in that case? Doesn't seem very practical, though. */ if (!expr->useOr) return false; @@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) if (!is_pseudo_constant_clause(lsecond(expr->args))) return false; - var = linitial(expr->args); + clause_expr = linitial(expr->args); /* * If it's not an "=" operator, just ignore the clause, as it's not @@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) } else if (is_orclause(clause)) { - BoolExpr *expr = (BoolExpr *) clause; + BoolExpr *bool_expr = (BoolExpr *) clause; ListCell *lc; /* start with no attribute number */ *attnum = InvalidAttrNumber; - foreach(lc, expr->args) + foreach(lc, bool_expr->args) { AttrNumber clause_attnum; @@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) if (*attnum == InvalidAttrNumber) *attnum = clause_attnum; + /* ensure all the variables are the same (same attnum) */ if (*attnum != clause_attnum) return false; } @@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) * "NOT x" can be interpreted as "x = false", so get the argument and * proceed with seeing if it's a suitable Var. */ - var = (Var *) get_notclausearg(clause); + clause_expr = (Node *) get_notclausearg(clause); } else { @@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) * A boolean expression "x" can be interpreted as "x = true", so * proceed with seeing if it's a suitable Var. */ - var = (Var *) clause; + clause_expr = (Node *) clause; } /* * We may ignore any RelabelType node above the operand. (There won't be * more than one, since eval_const_expressions has been applied already.) */ - if (IsA(var, RelabelType)) - var = (Var *) ((RelabelType *) var)->arg; + if (IsA(clause_expr, RelabelType)) + clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; /* We only support plain Vars for now */ - if (!IsA(var, Var)) + if (!IsA(clause_expr, Var)) return false; + /* OK, we know we have a Var */ + var = (Var *) clause_expr; + /* Ensure Var is from the correct relation */ if (var->varno != relid) return false; @@ -1158,6 +1147,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, } /* + * dependency_is_compatible_expression + * Determines if the expression is compatible with functional dependencies + * + * Similar to dependency_is_compatible_clause, but doesn't enforce that the + * expression is a simple Var. OTOH we check that there's at least one + * statistics object matching the expression. + */ +static bool +dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr) +{ + List *vars; + ListCell *lc, + *lc2; + Node *clause_expr; + + if (IsA(clause, RestrictInfo)) + { + RestrictInfo *rinfo = (RestrictInfo *) clause; + + /* Pseudoconstants are not interesting (they couldn't contain a Var) */ + if (rinfo->pseudoconstant) + return false; + + /* Clauses referencing multiple, or no, varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + clause = (Node *) rinfo->clause; + } + + if (is_opclause(clause)) + { + /* If it's an opclause, check for Var = Const or Const = Var. */ + OpExpr *expr = (OpExpr *) clause; + + /* Only expressions with two arguments are candidates. */ + if (list_length(expr->args) != 2) + return false; + + /* Make sure non-selected argument is a pseudoconstant. */ + if (is_pseudo_constant_clause(lsecond(expr->args))) + clause_expr = linitial(expr->args); + else if (is_pseudo_constant_clause(linitial(expr->args))) + clause_expr = lsecond(expr->args); + else + return false; + + /* + * If it's not an "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. + * + * This uses the function for estimating selectivity, not the operator + * directly (a bit awkward, but well ...). + * + * XXX this is pretty dubious; probably it'd be better to check btree + * or hash opclass membership, so as not to be fooled by custom + * selectivity functions, and to be more consistent with decisions + * elsewhere in the planner. + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + /* OK to proceed with checking "var" */ + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + /* If it's an scalar array operator, check for Var IN Const. */ + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; + + /* + * Reject ALL() variant, we only care about ANY/IN. + * + * FIXME Maybe we should check if all the values are the same, and + * allow ALL in that case? Doesn't seem very practical, though. + */ + if (!expr->useOr) + return false; + + /* Only expressions with two arguments are candidates. */ + if (list_length(expr->args) != 2) + return false; + + /* + * We know it's always (Var IN Const), so we assume the var is the + * first argument, and pseudoconstant is the second one. + */ + if (!is_pseudo_constant_clause(lsecond(expr->args))) + return false; + + clause_expr = linitial(expr->args); + + /* + * If it's not an "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. The operator is identified + * simply by looking at which function it uses to estimate + * selectivity. That's a bit strange, but it's what other similar + * places do. + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + /* OK to proceed with checking "var" */ + } + else if (is_orclause(clause)) + { + BoolExpr *bool_expr = (BoolExpr *) clause; + ListCell *lc; + + /* start with no expression (we'll use the first match) */ + *expr = NULL; + + foreach(lc, bool_expr->args) + { + Node *or_expr = NULL; + + /* + * Had we found incompatible expression in the arguments, treat + * the whole expression as incompatible. + */ + if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid, + statlist, &or_expr)) + return false; + + if (*expr == NULL) + *expr = or_expr; + + /* ensure all the expressions are the same */ + if (!equal(or_expr, *expr)) + return false; + } + + /* the expression is already checked by the recursive call */ + return true; + } + else if (is_notclause(clause)) + { + /* + * "NOT x" can be interpreted as "x = false", so get the argument and + * proceed with seeing if it's a suitable Var. + */ + clause_expr = (Node *) get_notclausearg(clause); + } + else + { + /* + * A boolean expression "x" can be interpreted as "x = true", so + * proceed with seeing if it's a suitable Var. + */ + clause_expr = (Node *) clause; + } + + /* + * We may ignore any RelabelType node above the operand. (There won't be + * more than one, since eval_const_expressions has been applied already.) + */ + if (IsA(clause_expr, RelabelType)) + clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; + + vars = pull_var_clause(clause_expr, 0); + + foreach(lc, vars) + { + Var *var = (Var *) lfirst(lc); + + /* Ensure Var is from the correct relation */ + if (var->varno != relid) + return false; + + /* We also better ensure the Var is from the current level */ + if (var->varlevelsup != 0) + return false; + + /* Also ignore system attributes (we don't allow stats on those) */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + } + + /* + * Check if we actually have a matching statistics for the expression. + * + * XXX Maybe this is an overkill. We'll eliminate the expressions later. + */ + foreach(lc, statlist) + { + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + + /* ignore stats without dependencies */ + if (info->kind != STATS_EXT_DEPENDENCIES) + continue; + + foreach(lc2, info->exprs) + { + Node *stat_expr = (Node *) lfirst(lc2); + + if (equal(clause_expr, stat_expr)) + { + *expr = stat_expr; + return true; + } + } + } + + return false; +} + +/* * dependencies_clauselist_selectivity * Return the estimated selectivity of (a subset of) the given clauses * using functional dependency statistics, or 1.0 if no useful functional @@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root, MVDependency **dependencies; int ndependencies; int i; + AttrNumber attnum_offset; + + /* unique expressions */ + Node **unique_exprs; + int unique_exprs_cnt; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) @@ -1213,6 +1413,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root, list_length(clauses)); /* + * We allocate space as if every clause was a unique expression, although + * that's probably overkill. Some will be simple column references that + * we'll translate to attnums, and there might be duplicates. But it's + * easier and cheaper to just do one allocation than repalloc later. + */ + unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses)); + unique_exprs_cnt = 0; + + /* * Pre-process the clauses list to extract the attnums seen in each item. * We need to determine if there's any clauses which will be useful for * dependency selectivity estimations. Along the way we'll record all of @@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * * We also skip clauses that we already estimated using different types of * statistics (we treat them as incompatible). + * + * To handle expressions, we assign them negative attnums, as if it was a + * system attribute (this is fine, as we only allow extended stats on user + * attributes). And then we offset everything by the number of + * expressions, so that we can store the values in a bitmapset. */ listidx = 0; foreach(l, clauses) { Node *clause = (Node *) lfirst(l); AttrNumber attnum; + Node *expr = NULL; + + /* ignore clause by default */ + list_attnums[listidx] = InvalidAttrNumber; - if (!bms_is_member(listidx, *estimatedclauses) && - dependency_is_compatible_clause(clause, rel->relid, &attnum)) + if (!bms_is_member(listidx, *estimatedclauses)) { - list_attnums[listidx] = attnum; - clauses_attnums = bms_add_member(clauses_attnums, attnum); + /* + * If it's a simple column refrence, just extract the attnum. If + * it's an expression, assign a negative attnum as if it was a + * system attribute. + */ + if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + } + else if (dependency_is_compatible_expression(clause, rel->relid, + rel->statlist, + &expr)) + { + /* special attnum assigned to this expression */ + attnum = InvalidAttrNumber; + + Assert(expr != NULL); + + /* If the expression is duplicate, use the same attnum. */ + for (i = 0; i < unique_exprs_cnt; i++) + { + if (equal(unique_exprs[i], expr)) + { + /* negative attribute number to expression */ + attnum = -(i + 1); + break; + } + } + + /* not found in the list, so add it */ + if (attnum == InvalidAttrNumber) + { + unique_exprs[unique_exprs_cnt++] = expr; + + /* after incrementing the value, to get -1, -2, ... */ + attnum = (-unique_exprs_cnt); + } + + /* remember which attnum was assigned to this clause */ + list_attnums[listidx] = attnum; + } } - else - list_attnums[listidx] = InvalidAttrNumber; listidx++; } + Assert(listidx == list_length(clauses)); + /* - * If there's not at least two distinct attnums then reject the whole list - * of clauses. We must return 1.0 so the calling function's selectivity is - * unaffected. + * How much we need to offset the attnums? If there are no expressions, + * then no offset is needed. Otherwise we need to offset enough for the + * lowest value (-unique_exprs_cnt) to become 1. + */ + if (unique_exprs_cnt > 0) + attnum_offset = (unique_exprs_cnt + 1); + else + attnum_offset = 0; + + /* + * Now that we know how many expressions there are, we can offset the + * values just enough to build the bitmapset. + */ + for (i = 0; i < list_length(clauses); i++) + { + AttrNumber attnum; + + /* ignore incompatible or already estimated clauses */ + if (list_attnums[i] == InvalidAttrNumber) + continue; + + /* make sure the attnum is in the expected range */ + Assert(list_attnums[i] >= (-unique_exprs_cnt)); + Assert(list_attnums[i] <= MaxHeapAttributeNumber); + + /* make sure the attnum is positive (valid AttrNumber) */ + attnum = list_attnums[i] + attnum_offset; + + /* + * Either it's a regular attribute, or it's an expression, in which + * case we must not have seen it before (expressions are unique). + * + * XXX Check whether it's a regular attribute has to be done using the + * original attnum, while the second check has to use the value with + * an offset. + */ + Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) || + !bms_is_member(attnum, clauses_attnums)); + + /* + * Remember the offset attnum, both for attributes and expressions. + * We'll pass list_attnums to clauselist_apply_dependencies, which + * uses it to identify clauses in a bitmap. We could also pass the + * offset, but this is more convenient. + */ + list_attnums[i] = attnum; + + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + + /* + * If there's not at least two distinct attnums and expressions, then + * reject the whole list of clauses. We must return 1.0 so the calling + * function's selectivity is unaffected. */ if (bms_membership(clauses_attnums) != BMS_MULTIPLE) { @@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root, foreach(l, rel->statlist) { StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l); - Bitmapset *matched; - BMS_Membership membership; + int nmatched; + int nexprs; + int k; + MVDependencies *deps; /* skip statistics that are not of the correct type */ if (stat->kind != STATS_EXT_DEPENDENCIES) continue; - matched = bms_intersect(clauses_attnums, stat->keys); - membership = bms_membership(matched); - bms_free(matched); + /* + * Count matching attributes - we have to undo the attnum offsets. The + * input attribute numbers are not offset (expressions are not + * included in stat->keys, so it's not necessary). But we need to + * offset it before checking against clauses_attnums. + */ + nmatched = 0; + k = -1; + while ((k = bms_next_member(stat->keys, k)) >= 0) + { + AttrNumber attnum = (AttrNumber) k; - /* skip objects matching fewer than two attributes from clauses */ - if (membership != BMS_MULTIPLE) + /* skip expressions */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + /* apply the same offset as above */ + attnum += attnum_offset; + + if (bms_is_member(attnum, clauses_attnums)) + nmatched++; + } + + /* count matching expressions */ + nexprs = 0; + for (i = 0; i < unique_exprs_cnt; i++) + { + ListCell *lc; + + foreach(lc, stat->exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + /* try to match it */ + if (equal(stat_expr, unique_exprs[i])) + nexprs++; + } + } + + /* + * Skip objects matching fewer than two attributes/expressions from + * clauses. + */ + if (nmatched + nexprs < 2) continue; - func_dependencies[nfunc_dependencies] - = statext_dependencies_load(stat->statOid); + deps = statext_dependencies_load(stat->statOid); + + /* + * The expressions may be represented by different attnums in the + * stats, we need to remap them to be consistent with the clauses. + * That will make the later steps (e.g. picking the strongest item and + * so on) much simpler and cheaper, because it won't need to care + * about the offset at all. + * + * When we're at it, we can ignore dependencies that are not fully + * matched by clauses (i.e. referencing attributes or expressions that + * are not in the clauses). + * + * We have to do this for all statistics, as long as there are any + * expressions - we need to shift the attnums in all dependencies. + * + * XXX Maybe we should do this always, because it also eliminates some + * of the dependencies early. It might be cheaper than having to walk + * the longer list in find_strongest_dependency later, especially as + * we need to do that repeatedly? + * + * XXX We have to do this even when there are no expressions in + * clauses, otherwise find_strongest_dependency may fail for stats + * with expressions (due to lookup of negative value in bitmap). So we + * need to at least filter out those dependencies. Maybe we could do + * it in a cheaper way (if there are no expr clauses, we can just + * discard all negative attnums without any lookups). + */ + if (unique_exprs_cnt > 0 || stat->exprs != NIL) + { + int ndeps = 0; + + for (i = 0; i < deps->ndeps; i++) + { + bool skip = false; + MVDependency *dep = deps->deps[i]; + int j; + + for (j = 0; j < dep->nattributes; j++) + { + int idx; + Node *expr; + int k; + AttrNumber unique_attnum = InvalidAttrNumber; + AttrNumber attnum; + + /* undo the per-statistics offset */ + attnum = dep->attributes[j]; + + /* + * For regular attributes we can simply check if it + * matches any clause. If there's no matching clause, we + * can just ignore it. We need to offset the attnum + * though. + */ + if (AttrNumberIsForUserDefinedAttr(attnum)) + { + dep->attributes[j] = attnum + attnum_offset; + + if (!bms_is_member(dep->attributes[j], clauses_attnums)) + { + skip = true; + break; + } + + continue; + } + + /* + * the attnum should be a valid system attnum (-1, -2, + * ...) + */ + Assert(AttributeNumberIsValid(attnum)); + + /* + * For expressions, we need to do two translations. First + * we have to translate the negative attnum to index in + * the list of expressions (in the statistics object). + * Then we need to see if there's a matching clause. The + * index of the unique expression determines the attnum + * (and we offset it). + */ + idx = -(1 + attnum); + + /* Is the expression index is valid? */ + Assert((idx >= 0) && (idx < list_length(stat->exprs))); + + expr = (Node *) list_nth(stat->exprs, idx); + + /* try to find the expression in the unique list */ + for (k = 0; k < unique_exprs_cnt; k++) + { + /* + * found a matching unique expression, use the attnum + * (derived from index of the unique expression) + */ + if (equal(unique_exprs[k], expr)) + { + unique_attnum = -(k + 1) + attnum_offset; + break; + } + } + + /* + * Found no matching expression, so we can simply skip + * this dependency, because there's no chance it will be + * fully covered. + */ + if (unique_attnum == InvalidAttrNumber) + { + skip = true; + break; + } + + /* otherwise remap it to the new attnum */ + dep->attributes[j] = unique_attnum; + } - total_ndeps += func_dependencies[nfunc_dependencies]->ndeps; - nfunc_dependencies++; + /* if found a matching dependency, keep it */ + if (!skip) + { + /* maybe we've skipped something earlier, so move it */ + if (ndeps != i) + deps->deps[ndeps] = deps->deps[i]; + + ndeps++; + } + } + + deps->ndeps = ndeps; + } + + /* + * It's possible we've removed all dependencies, in which case we + * don't bother adding it to the list. + */ + if (deps->ndeps > 0) + { + func_dependencies[nfunc_dependencies] = deps; + total_ndeps += deps->ndeps; + nfunc_dependencies++; + } } /* if no matching stats could be found then we've nothing to do */ @@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, pfree(func_dependencies); bms_free(clauses_attnums); pfree(list_attnums); + pfree(unique_exprs); return 1.0; } @@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, pfree(func_dependencies); bms_free(clauses_attnums); pfree(list_attnums); + pfree(unique_exprs); return s1; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 7808c6a09c..8c75690fce 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -24,6 +24,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" +#include "executor/executor.h" #include "commands/progress.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" @@ -35,13 +36,16 @@ #include "statistics/statistics.h" #include "utils/acl.h" #include "utils/array.h" +#include "utils/attoptcache.h" #include "utils/builtins.h" +#include "utils/datum.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/selfuncs.h" #include "utils/syscache.h" +#include "utils/typcache.h" /* * To avoid consuming too much memory during analysis and/or too much space @@ -66,18 +70,38 @@ typedef struct StatExtEntry Bitmapset *columns; /* attribute numbers covered by the object */ List *types; /* 'char' list of enabled statistics kinds */ int stattarget; /* statistics target (-1 for default) */ + List *exprs; /* expressions */ } StatExtEntry; static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); -static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts); -static void statext_store(Oid relid, +static void statext_store(Oid statOid, MVNDistinct *ndistinct, MVDependencies *dependencies, - MCVList *mcv, VacAttrStats **stats); + MCVList *mcv, Datum exprs, VacAttrStats **stats); static int statext_compute_stattarget(int stattarget, int natts, VacAttrStats **stats); +/* Information needed to analyze a single simple expression. */ +typedef struct AnlExprData +{ + Node *expr; /* expression to analyze */ + VacAttrStats *vacattrstat; /* statistics attrs to analyze */ +} AnlExprData; + +static void compute_expr_stats(Relation onerel, double totalrows, + AnlExprData * exprdata, int nexprs, + HeapTuple *rows, int numrows); +static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs); +static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static AnlExprData *build_expr_data(List *exprs, int stattarget); + +static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat, + int numrows, HeapTuple *rows, + VacAttrStats **stats, int stattarget); + + /* * Compute requested extended stats, using the rows sampled for the plain * (single-column) stats. @@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, { Relation pg_stext; ListCell *lc; - List *stats; + List *statslist; MemoryContext cxt; MemoryContext oldcxt; int64 ext_cnt; + /* Do nothing if there are no columns to analyze. */ + if (!natts) + return; + cxt = AllocSetContextCreate(CurrentMemoryContext, "BuildRelationExtStatistics", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(cxt); pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock); - stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); + statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); /* report this phase */ - if (stats != NIL) + if (statslist != NIL) { const int index[] = { PROGRESS_ANALYZE_PHASE, @@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, }; const int64 val[] = { PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS, - list_length(stats) + list_length(statslist) }; pgstat_progress_update_multi_param(2, index, val); } ext_cnt = 0; - foreach(lc, stats) + foreach(lc, statslist) { StatExtEntry *stat = (StatExtEntry *) lfirst(lc); MVNDistinct *ndistinct = NULL; MVDependencies *dependencies = NULL; MCVList *mcv = NULL; + Datum exprstats = (Datum) 0; VacAttrStats **stats; ListCell *lc2; int stattarget; + StatsBuildData *data; /* * Check if we can build these stats based on the column analyzed. If * not, report this fact (except in autovacuum) and move on. */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) { @@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, continue; } - /* check allowed number of dimensions */ - Assert(bms_num_members(stat->columns) >= 2 && - bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); - /* compute statistics target for this statistics */ stattarget = statext_compute_stattarget(stat->stattarget, bms_num_members(stat->columns), @@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, if (stattarget == 0) continue; + /* evaluate expressions (if the statistics has any) */ + data = make_build_data(onerel, stat, numrows, rows, stats, stattarget); + /* compute statistic of each requested type */ foreach(lc2, stat->types) { char t = (char) lfirst_int(lc2); if (t == STATS_EXT_NDISTINCT) - ndistinct = statext_ndistinct_build(totalrows, numrows, rows, - stat->columns, stats); + ndistinct = statext_ndistinct_build(totalrows, data); else if (t == STATS_EXT_DEPENDENCIES) - dependencies = statext_dependencies_build(numrows, rows, - stat->columns, stats); + dependencies = statext_dependencies_build(data); else if (t == STATS_EXT_MCV) - mcv = statext_mcv_build(numrows, rows, stat->columns, stats, - totalrows, stattarget); + mcv = statext_mcv_build(data, totalrows, stattarget); + else if (t == STATS_EXT_EXPRESSIONS) + { + AnlExprData *exprdata; + int nexprs; + + /* should not happen, thanks to checks when defining stats */ + if (!stat->exprs) + elog(ERROR, "requested expression stats, but there are no expressions"); + + exprdata = build_expr_data(stat->exprs, stattarget); + nexprs = list_length(stat->exprs); + + compute_expr_stats(onerel, totalrows, + exprdata, nexprs, + rows, numrows); + + exprstats = serialize_expr_stats(exprdata, nexprs); + } } /* store the statistics in the catalog */ - statext_store(stat->statOid, ndistinct, dependencies, mcv, stats); + statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats); /* for reporting progress */ pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED, ++ext_cnt); + + /* free the build data (allocated as a single chunk) */ + pfree(data); } table_close(pg_stext, RowExclusiveLock); @@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel, MemoryContext oldcxt; int result = 0; + /* If there are no columns to analyze, just return 0. */ + if (!natts) + return 0; + cxt = AllocSetContextCreate(CurrentMemoryContext, "ComputeExtStatisticsRows", ALLOCSET_DEFAULT_SIZES); @@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel, * analyzed. If not, ignore it (don't report anything, we'll do that * during the actual build BuildRelationExtStatistics). */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) @@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type) attnum = Anum_pg_statistic_ext_data_stxdmcv; break; + case STATS_EXT_EXPRESSIONS: + attnum = Anum_pg_statistic_ext_data_stxdexpr; + break; + default: elog(ERROR, "unexpected statistics type requested: %d", type); } @@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) ArrayType *arr; char *enabled; Form_pg_statistic_ext staForm; + List *exprs = NIL; entry = palloc0(sizeof(StatExtEntry)); staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); @@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) { Assert((enabled[i] == STATS_EXT_NDISTINCT) || (enabled[i] == STATS_EXT_DEPENDENCIES) || - (enabled[i] == STATS_EXT_MCV)); + (enabled[i] == STATS_EXT_MCV) || + (enabled[i] == STATS_EXT_EXPRESSIONS)); entry->types = lappend_int(entry->types, (int) enabled[i]); } + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is not + * just an optimization, but is necessary, because the planner + * will be comparing them to similarly-processed qual clauses, and + * may fail to detect valid matches without this. We must not use + * canonicalize_qual, however, since these aren't qual + * expressions. + */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + } + + entry->exprs = exprs; + result = lappend(result, entry); } @@ -428,6 +514,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) } /* + * examine_attribute -- pre-analysis of a single column + * + * Determine whether the column is analyzable; if so, create and initialize + * a VacAttrStats struct for it. If not, return NULL. + */ +static VacAttrStats * +examine_attribute(Node *expr) +{ + HeapTuple typtuple; + VacAttrStats *stats; + int i; + bool ok; + + /* + * Create the VacAttrStats struct. Note that we only have a copy of the + * fixed fields of the pg_attribute tuple. + */ + stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + + /* fake the attribute */ + stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE); + stats->attr->attstattarget = -1; + + /* + * When analyzing an expression, believe the expression tree's type not + * the column datatype --- the latter might be the opckeytype storage + * type of the opclass, which is not interesting for our purposes. (Note: + * if we did anything with non-expression statistics columns, we'd need to + * figure out where to get the correct type info from, but for now that's + * not a problem.) It's not clear whether anyone will care about the + * typmod, but we store that too just in case. + */ + stats->attrtypid = exprType(expr); + stats->attrtypmod = exprTypmod(expr); + stats->attrcollid = exprCollation(expr); + + typtuple = SearchSysCacheCopy1(TYPEOID, + ObjectIdGetDatum(stats->attrtypid)); + if (!HeapTupleIsValid(typtuple)) + elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); + stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); + + /* + * We don't actually analyze individual attributes, so no need to set the + * memory context. + */ + stats->anl_context = NULL; + stats->tupattnum = InvalidAttrNumber; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. + */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + stats->statypid[i] = stats->attrtypid; + stats->statyplen[i] = stats->attrtype->typlen; + stats->statypbyval[i] = stats->attrtype->typbyval; + stats->statypalign[i] = stats->attrtype->typalign; + } + + /* + * Call the type-specific typanalyze function. If none is specified, use + * std_typanalyze(). + */ + if (OidIsValid(stats->attrtype->typanalyze)) + ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, + PointerGetDatum(stats))); + else + ok = std_typanalyze(stats); + + if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) + { + heap_freetuple(typtuple); + pfree(stats->attr); + pfree(stats); + return NULL; + } + + return stats; +} + +/* + * examine_expression -- pre-analysis of a single expression + * + * Determine whether the expression is analyzable; if so, create and initialize + * a VacAttrStats struct for it. If not, return NULL. + */ +static VacAttrStats * +examine_expression(Node *expr, int stattarget) +{ + HeapTuple typtuple; + VacAttrStats *stats; + int i; + bool ok; + + Assert(expr != NULL); + + /* + * Create the VacAttrStats struct. + */ + stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + + /* + * When analyzing an expression, believe the expression tree's type. + */ + stats->attrtypid = exprType(expr); + stats->attrtypmod = exprTypmod(expr); + + /* + * We don't allow collation to be specified in CREATE STATISTICS, so we + * have to use the collation specified for the expression. It's possible + * to specify the collation in the expression "(col COLLATE "en_US")" in + * which case exprCollation() does the right thing. + */ + stats->attrcollid = exprCollation(expr); + + /* + * We don't have any pg_attribute for expressions, so let's fake something + * reasonable into attstattarget, which is the only thing std_typanalyze + * needs. + */ + stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE); + + /* + * We can't have statistics target specified for the expression, so we + * could use either the default_statistics_target, or the target computed + * for the extended statistics. The second option seems more reasonable. + */ + stats->attr->attstattarget = stattarget; + + /* initialize some basic fields */ + stats->attr->attrelid = InvalidOid; + stats->attr->attnum = InvalidAttrNumber; + stats->attr->atttypid = stats->attrtypid; + + typtuple = SearchSysCacheCopy1(TYPEOID, + ObjectIdGetDatum(stats->attrtypid)); + if (!HeapTupleIsValid(typtuple)) + elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); + + stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); + stats->anl_context = CurrentMemoryContext; /* XXX should be using + * something else? */ + stats->tupattnum = InvalidAttrNumber; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. + */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + stats->statypid[i] = stats->attrtypid; + stats->statyplen[i] = stats->attrtype->typlen; + stats->statypbyval[i] = stats->attrtype->typbyval; + stats->statypalign[i] = stats->attrtype->typalign; + } + + /* + * Call the type-specific typanalyze function. If none is specified, use + * std_typanalyze(). + */ + if (OidIsValid(stats->attrtype->typanalyze)) + ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, + PointerGetDatum(stats))); + else + ok = std_typanalyze(stats); + + if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) + { + heap_freetuple(typtuple); + pfree(stats); + return NULL; + } + + return stats; +} + +/* * Using 'vacatts' of size 'nvacatts' as input data, return a newly built * VacAttrStats array which includes only the items corresponding to * attributes indicated by 'stxkeys'. If we don't have all of the per column @@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) * to the caller that the stats should not be built. */ static VacAttrStats ** -lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts) { int i = 0; int x = -1; + int natts; VacAttrStats **stats; + ListCell *lc; + + natts = bms_num_members(attrs) + list_length(exprs); - stats = (VacAttrStats **) - palloc(bms_num_members(attrs) * sizeof(VacAttrStats *)); + stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *)); /* lookup VacAttrStats info for the requested columns (same attnum) */ while ((x = bms_next_member(attrs, x)) >= 0) @@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, i++; } + /* also add info for expressions */ + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + + stats[i] = examine_attribute(expr); + + /* + * XXX We need tuple descriptor later, and we just grab it from + * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded + * examine_attribute does not set that, so just grab it from the first + * vacatts element. + */ + stats[i]->tupDesc = vacatts[0]->tupDesc; + + i++; + } + return stats; } @@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, static void statext_store(Oid statOid, MVNDistinct *ndistinct, MVDependencies *dependencies, - MCVList *mcv, VacAttrStats **stats) + MCVList *mcv, Datum exprs, VacAttrStats **stats) { Relation pg_stextdata; HeapTuple stup, @@ -532,11 +820,17 @@ statext_store(Oid statOid, nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL); values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data); } + if (exprs != (Datum) 0) + { + nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false; + values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs; + } /* always replace the value (either by bytea or NULL) */ replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true; replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true; replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; + replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true; /* there should already be a pg_statistic_ext_data tuple */ oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid)); @@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup) * is not necessary here (and when querying the bitmap). */ AttrNumber * -build_attnums_array(Bitmapset *attrs, int *numattrs) +build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs) { int i, j; @@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs) j = -1; while ((j = bms_next_member(attrs, j)) >= 0) { + AttrNumber attnum = (j - nexprs); + /* * Make sure the bitmap contains only user-defined attributes. As * bitmaps can't contain negative values, this can be violated in two * ways. Firstly, the bitmap might contain 0 as a member, and secondly * the integer value might be larger than MaxAttrNumber. */ - Assert(AttrNumberIsForUserDefinedAttr(j)); - Assert(j <= MaxAttrNumber); + Assert(AttributeNumberIsValid(attnum)); + Assert(attnum <= MaxAttrNumber); + Assert(attnum >= (-nexprs)); - attnums[i++] = (AttrNumber) j; + attnums[i++] = (AttrNumber) attnum; /* protect against overflows */ Assert(i <= num); @@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs) * can simply pfree the return value to release all of it. */ SortItem * -build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, - MultiSortSupport mss, int numattrs, AttrNumber *attnums) +build_sorted_items(StatsBuildData *data, int *nitems, + MultiSortSupport mss, + int numattrs, AttrNumber *attnums) { int i, j, len, - idx; - int nvalues = numrows * numattrs; + nrows; + int nvalues = data->numrows * numattrs; SortItem *items; Datum *values; bool *isnull; char *ptr; + int *typlen; /* Compute the total amount of memory we need (both items and values). */ - len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); + len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); /* Allocate the memory and split it into the pieces. */ ptr = palloc0(len); /* items to sort */ items = (SortItem *) ptr; - ptr += numrows * sizeof(SortItem); + ptr += data->numrows * sizeof(SortItem); /* values and null flags */ values = (Datum *) ptr; @@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, Assert((ptr - (char *) items) == len); /* fix the pointers to Datum and bool arrays */ - idx = 0; - for (i = 0; i < numrows; i++) + nrows = 0; + for (i = 0; i < data->numrows; i++) { - bool toowide = false; + items[nrows].values = &values[nrows * numattrs]; + items[nrows].isnull = &isnull[nrows * numattrs]; - items[idx].values = &values[idx * numattrs]; - items[idx].isnull = &isnull[idx * numattrs]; + nrows++; + } + + /* build a local cache of typlen for all attributes */ + typlen = (int *) palloc(sizeof(int) * data->nattnums); + for (i = 0; i < data->nattnums; i++) + typlen[i] = get_typlen(data->stats[i]->attrtypid); + + nrows = 0; + for (i = 0; i < data->numrows; i++) + { + bool toowide = false; /* load the values/null flags from sample rows */ for (j = 0; j < numattrs; j++) { Datum value; bool isnull; + int attlen; + AttrNumber attnum = attnums[j]; + + int idx; + + /* match attnum to the pre-calculated data */ + for (idx = 0; idx < data->nattnums; idx++) + { + if (attnum == data->attnums[idx]) + break; + } - value = heap_getattr(rows[i], attnums[j], tdesc, &isnull); + Assert(idx < data->nattnums); + + value = data->values[idx][i]; + isnull = data->nulls[idx][i]; + attlen = typlen[idx]; /* * If this is a varlena value, check if it's too wide and if yes @@ -770,8 +1095,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, * on the assumption that those are small (below WIDTH_THRESHOLD) * and will be discarded at the end of analyze. */ - if ((!isnull) && - (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1)) + if ((!isnull) && (attlen == -1)) { if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { @@ -782,21 +1106,21 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, value = PointerGetDatum(PG_DETOAST_DATUM(value)); } - items[idx].values[j] = value; - items[idx].isnull[j] = isnull; + items[nrows].values[j] = value; + items[nrows].isnull[j] = isnull; } if (toowide) continue; - idx++; + nrows++; } /* store the actual number of items (ignoring the too-wide ones) */ - *nitems = idx; + *nitems = nrows; /* all items were too wide */ - if (idx == 0) + if (nrows == 0) { /* everything is allocated as a single chunk */ pfree(items); @@ -804,7 +1128,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, } /* do the sort, using the multi-sort */ - qsort_arg((void *) items, idx, sizeof(SortItem), + qsort_arg((void *) items, nrows, sizeof(SortItem), multi_sort_compare, mss); return items; @@ -831,6 +1155,63 @@ has_stats_of_kind(List *stats, char requiredkind) } /* + * stat_find_expression + * Search for an expression in statistics object's list of expressions. + * + * Returns the index of the expression in the statistics object's list of + * expressions, or -1 if not found. + */ +static int +stat_find_expression(StatisticExtInfo *stat, Node *expr) +{ + ListCell *lc; + int idx; + + idx = 0; + foreach(lc, stat->exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + if (equal(stat_expr, expr)) + return idx; + idx++; + } + + /* Expression not found */ + return -1; +} + +/* + * stat_covers_expressions + * Test whether a statistics object covers all expressions in a list. + * + * Returns true if all expressions are covered. If expr_idxs is non-NULL, it + * is populated with the indexes of the expressions found. + */ +static bool +stat_covers_expressions(StatisticExtInfo *stat, List *exprs, + Bitmapset **expr_idxs) +{ + ListCell *lc; + + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + int expr_idx; + + expr_idx = stat_find_expression(stat, expr); + if (expr_idx == -1) + return false; + + if (expr_idxs != NULL) + *expr_idxs = bms_add_member(*expr_idxs, expr_idx); + } + + /* If we reach here, all expressions are covered */ + return true; +} + +/* * choose_best_statistics * Look for and return statistics with the specified 'requiredkind' which * have keys that match at least two of the given attnums. Return NULL if @@ -850,7 +1231,8 @@ has_stats_of_kind(List *stats, char requiredkind) */ StatisticExtInfo * choose_best_statistics(List *stats, char requiredkind, - Bitmapset **clause_attnums, int nclauses) + Bitmapset **clause_attnums, List **clause_exprs, + int nclauses) { ListCell *lc; StatisticExtInfo *best_match = NULL; @@ -861,7 +1243,8 @@ choose_best_statistics(List *stats, char requiredkind, { int i; StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); - Bitmapset *matched = NULL; + Bitmapset *matched_attnums = NULL; + Bitmapset *matched_exprs = NULL; int num_matched; int numkeys; @@ -870,35 +1253,43 @@ choose_best_statistics(List *stats, char requiredkind, continue; /* - * Collect attributes in remaining (unestimated) clauses fully covered - * by this statistic object. + * Collect attributes and expressions in remaining (unestimated) + * clauses fully covered by this statistic object. */ for (i = 0; i < nclauses; i++) { + Bitmapset *expr_idxs = NULL; + /* ignore incompatible/estimated clauses */ - if (!clause_attnums[i]) + if (!clause_attnums[i] && !clause_exprs[i]) continue; /* ignore clauses that are not covered by this object */ - if (!bms_is_subset(clause_attnums[i], info->keys)) + if (!bms_is_subset(clause_attnums[i], info->keys) || + !stat_covers_expressions(info, clause_exprs[i], &expr_idxs)) continue; - matched = bms_add_members(matched, clause_attnums[i]); + /* record attnums and indexes of expressions covered */ + matched_attnums = bms_add_members(matched_attnums, clause_attnums[i]); + matched_exprs = bms_add_members(matched_exprs, expr_idxs); } - num_matched = bms_num_members(matched); - bms_free(matched); + num_matched = bms_num_members(matched_attnums) + bms_num_members(matched_exprs); + + bms_free(matched_attnums); + bms_free(matched_exprs); /* * save the actual number of keys in the stats so that we can choose * the narrowest stats with the most matching keys. */ - numkeys = bms_num_members(info->keys); + numkeys = bms_num_members(info->keys) + list_length(info->exprs); /* - * Use this object when it increases the number of matched clauses or - * when it matches the same number of attributes but these stats have - * fewer keys than any previous match. + * Use this object when it increases the number of matched attributes + * and expressions or when it matches the same number of attributes + * and expressions but these stats have fewer keys than any previous + * match. */ if (num_matched > best_num_matched || (num_matched == best_num_matched && numkeys < best_match_keys)) @@ -923,7 +1314,8 @@ choose_best_statistics(List *stats, char requiredkind, */ static bool statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, - Index relid, Bitmapset **attnums) + Index relid, Bitmapset **attnums, + List **exprs) { /* Look inside any binary-compatible relabeling (as in examine_variable) */ if (IsA(clause, RelabelType)) @@ -951,19 +1343,19 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return true; } - /* (Var op Const) or (Const op Var) */ + /* (Var/Expr op Const) or (Const op Var/Expr) */ if (is_opclause(clause)) { RangeTblEntry *rte = root->simple_rte_array[relid]; OpExpr *expr = (OpExpr *) clause; - Var *var; + Node *clause_expr; /* Only expressions with two arguments are considered compatible. */ if (list_length(expr->args) != 2) return false; - /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + /* Check if the expression has the right shape */ + if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL)) return false; /* @@ -981,7 +1373,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, case F_SCALARLESEL: case F_SCALARGTSEL: case F_SCALARGESEL: - /* supported, will continue with inspection of the Var */ + /* supported, will continue with inspection of the Var/Expr */ break; default: @@ -1003,23 +1395,29 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, !get_func_leakproof(get_opcode(expr->opno))) return false; - return statext_is_compatible_clause_internal(root, (Node *) var, - relid, attnums); + /* Check (Var op Const) or (Const op Var) clauses by recursing. */ + if (IsA(clause_expr, Var)) + return statext_is_compatible_clause_internal(root, clause_expr, + relid, attnums, exprs); + + /* Otherwise we have (Expr op Const) or (Const op Expr). */ + *exprs = lappend(*exprs, clause_expr); + return true; } - /* Var IN Array */ + /* Var/Expr IN Array */ if (IsA(clause, ScalarArrayOpExpr)) { RangeTblEntry *rte = root->simple_rte_array[relid]; ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; - Var *var; + Node *clause_expr; /* Only expressions with two arguments are considered compatible. */ if (list_length(expr->args) != 2) return false; /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL)) return false; /* @@ -1037,7 +1435,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, case F_SCALARLESEL: case F_SCALARGTSEL: case F_SCALARGESEL: - /* supported, will continue with inspection of the Var */ + /* supported, will continue with inspection of the Var/Expr */ break; default: @@ -1059,8 +1457,14 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, !get_func_leakproof(get_opcode(expr->opno))) return false; - return statext_is_compatible_clause_internal(root, (Node *) var, - relid, attnums); + /* Check Var IN Array clauses by recursing. */ + if (IsA(clause_expr, Var)) + return statext_is_compatible_clause_internal(root, clause_expr, + relid, attnums, exprs); + + /* Otherwise we have Expr IN Array. */ + *exprs = lappend(*exprs, clause_expr); + return true; } /* AND/OR/NOT clause */ @@ -1093,54 +1497,62 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, */ if (!statext_is_compatible_clause_internal(root, (Node *) lfirst(lc), - relid, attnums)) + relid, attnums, exprs)) return false; } return true; } - /* Var IS NULL */ + /* Var/Expr IS NULL */ if (IsA(clause, NullTest)) { NullTest *nt = (NullTest *) clause; - /* - * Only simple (Var IS NULL) expressions supported for now. Maybe we - * could use examine_variable to fix this? - */ - if (!IsA(nt->arg, Var)) - return false; + /* Check Var IS NULL clauses by recursing. */ + if (IsA(nt->arg, Var)) + return statext_is_compatible_clause_internal(root, (Node *) (nt->arg), + relid, attnums, exprs); - return statext_is_compatible_clause_internal(root, (Node *) (nt->arg), - relid, attnums); + /* Otherwise we have Expr IS NULL. */ + *exprs = lappend(*exprs, nt->arg); + return true; } - return false; + /* + * Treat any other expressions as bare expressions to be matched against + * expressions in statistics objects. + */ + *exprs = lappend(*exprs, clause); + return true; } /* * statext_is_compatible_clause * Determines if the clause is compatible with MCV lists. * - * Currently, we only support three types of clauses: + * Currently, we only support the following types of clauses: * - * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op - * is one of ("=", "<", ">", ">=", "<=") + * (a) OpExprs of the form (Var/Expr op Const), or (Const op Var/Expr), where + * the op is one of ("=", "<", ">", ">=", "<=") * - * (b) (Var IS [NOT] NULL) + * (b) (Var/Expr IS [NOT] NULL) * * (c) combinations using AND/OR/NOT * + * (d) ScalarArrayOpExprs of the form (Var/Expr op ANY (array)) or (Var/Expr + * op ALL (array)) + * * In the future, the range of supported clauses may be expanded to more * complex cases, for example (Var op Var). */ static bool statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, - Bitmapset **attnums) + Bitmapset **attnums, List **exprs) { RangeTblEntry *rte = root->simple_rte_array[relid]; RestrictInfo *rinfo = (RestrictInfo *) clause; + int clause_relid; Oid userid; /* @@ -1160,7 +1572,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, foreach(lc, expr->args) { if (!statext_is_compatible_clause(root, (Node *) lfirst(lc), - relid, attnums)) + relid, attnums, exprs)) return false; } @@ -1175,25 +1587,36 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, if (rinfo->pseudoconstant) return false; - /* clauses referencing multiple varnos are incompatible */ - if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + /* Clauses referencing other varnos are incompatible. */ + if (!bms_get_singleton_member(rinfo->clause_relids, &clause_relid) || + clause_relid != relid) return false; /* Check the clause and determine what attributes it references. */ if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause, - relid, attnums)) + relid, attnums, exprs)) return false; /* - * Check that the user has permission to read all these attributes. Use + * Check that the user has permission to read all required attributes. Use * checkAsUser if it's set, in case we're accessing the table via a view. */ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK) { + Bitmapset *clause_attnums; + /* Don't have table privilege, must check individual columns */ - if (bms_is_member(InvalidAttrNumber, *attnums)) + if (*exprs != NIL) + { + pull_varattnos((Node *) exprs, relid, &clause_attnums); + clause_attnums = bms_add_members(clause_attnums, *attnums); + } + else + clause_attnums = *attnums; + + if (bms_is_member(InvalidAttrNumber, clause_attnums)) { /* Have a whole-row reference, must have access to all columns */ if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT, @@ -1205,7 +1628,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, /* Check the columns referenced by the clause */ int attnum = -1; - while ((attnum = bms_next_member(*attnums, attnum)) >= 0) + while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0) { if (pg_attribute_aclcheck(rte->relid, attnum, userid, ACL_SELECT) != ACLCHECK_OK) @@ -1259,7 +1682,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli bool is_or) { ListCell *l; - Bitmapset **list_attnums; + Bitmapset **list_attnums; /* attnums extracted from the clause */ + List **list_exprs; /* expressions matched to any statistic */ int listidx; Selectivity sel = (is_or) ? 0.0 : 1.0; @@ -1270,13 +1694,16 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); + /* expressions extracted from complex expressions */ + list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses)); + /* - * Pre-process the clauses list to extract the attnums seen in each item. - * We need to determine if there's any clauses which will be useful for - * selectivity estimations with extended stats. Along the way we'll record - * all of the attnums for each clause in a list which we'll reference - * later so we don't need to repeat the same work again. We'll also keep - * track of all attnums seen. + * Pre-process the clauses list to extract the attnums and expressions + * seen in each item. We need to determine if there are any clauses which + * will be useful for selectivity estimations with extended stats. Along + * the way we'll record all of the attnums and expressions for each clause + * in lists which we'll reference later so we don't need to repeat the + * same work again. * * We also skip clauses that we already estimated using different types of * statistics (we treat them as incompatible). @@ -1286,12 +1713,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli { Node *clause = (Node *) lfirst(l); Bitmapset *attnums = NULL; + List *exprs = NIL; if (!bms_is_member(listidx, *estimatedclauses) && - statext_is_compatible_clause(root, clause, rel->relid, &attnums)) + statext_is_compatible_clause(root, clause, rel->relid, &attnums, &exprs)) + { list_attnums[listidx] = attnums; + list_exprs[listidx] = exprs; + } else + { list_attnums[listidx] = NULL; + list_exprs[listidx] = NIL; + } listidx++; } @@ -1305,7 +1739,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* find the best suited statistics object for these attnums */ stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, - list_attnums, list_length(clauses)); + list_attnums, list_exprs, + list_length(clauses)); /* * if no (additional) matching stats could be found then we've nothing @@ -1320,28 +1755,39 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* now filter the clauses to be estimated using the selected MCV */ stat_clauses = NIL; - /* record which clauses are simple (single column) */ + /* record which clauses are simple (single column or expression) */ simple_clauses = NULL; listidx = 0; foreach(l, clauses) { /* - * If the clause is compatible with the selected statistics, mark - * it as estimated and add it to the list to estimate. + * If the clause is not already estimated and is compatible with + * the selected statistics object (all attributes and expressions + * covered), mark it as estimated and add it to the list to + * estimate. */ - if (list_attnums[listidx] != NULL && - bms_is_subset(list_attnums[listidx], stat->keys)) + if (!bms_is_member(listidx, *estimatedclauses) && + bms_is_subset(list_attnums[listidx], stat->keys) && + stat_covers_expressions(stat, list_exprs[listidx], NULL)) { - if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON) + /* record simple clauses (single column or expression) */ + if ((list_attnums[listidx] == NULL && + list_length(list_exprs[listidx]) == 1) || + (list_exprs[listidx] == NIL && + bms_membership(list_attnums[listidx]) == BMS_SINGLETON)) simple_clauses = bms_add_member(simple_clauses, list_length(stat_clauses)); + /* add clause to list and mark as estimated */ stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); *estimatedclauses = bms_add_member(*estimatedclauses, listidx); bms_free(list_attnums[listidx]); list_attnums[listidx] = NULL; + + list_free(list_exprs[listidx]); + list_exprs[listidx] = NULL; } listidx++; @@ -1530,23 +1976,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, } /* - * examine_opclause_expression - * Split expression into Var and Const parts. + * examine_opclause_args + * Split an operator expression's arguments into Expr and Const parts. * - * Attempts to match the arguments to either (Var op Const) or (Const op Var), - * possibly with a RelabelType on top. When the expression matches this form, - * returns true, otherwise returns false. + * Attempts to match the arguments to either (Expr op Const) or (Const op + * Expr), possibly with a RelabelType on top. When the expression matches this + * form, returns true, otherwise returns false. * - * Optionally returns pointers to the extracted Var/Const nodes, when passed - * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies - * on which side of the operator we found the Var node. + * Optionally returns pointers to the extracted Expr/Const nodes, when passed + * non-null pointers (exprp, cstp and expronleftp). The expronleftp flag + * specifies on which side of the operator we found the expression node. */ bool -examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) +examine_opclause_args(List *args, Node **exprp, Const **cstp, + bool *expronleftp) { - Var *var; + Node *expr; Const *cst; - bool varonleft; + bool expronleft; Node *leftop, *rightop; @@ -1563,30 +2010,564 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) if (IsA(rightop, RelabelType)) rightop = (Node *) ((RelabelType *) rightop)->arg; - if (IsA(leftop, Var) && IsA(rightop, Const)) + if (IsA(rightop, Const)) { - var = (Var *) leftop; + expr = (Node *) leftop; cst = (Const *) rightop; - varonleft = true; + expronleft = true; } - else if (IsA(leftop, Const) && IsA(rightop, Var)) + else if (IsA(leftop, Const)) { - var = (Var *) rightop; + expr = (Node *) rightop; cst = (Const *) leftop; - varonleft = false; + expronleft = false; } else return false; /* return pointers to the extracted parts if requested */ - if (varp) - *varp = var; + if (exprp) + *exprp = expr; if (cstp) *cstp = cst; - if (varonleftp) - *varonleftp = varonleft; + if (expronleftp) + *expronleftp = expronleft; return true; } + + +/* + * Compute statistics about expressions of a relation. + */ +static void +compute_expr_stats(Relation onerel, double totalrows, + AnlExprData *exprdata, int nexprs, + HeapTuple *rows, int numrows) +{ + MemoryContext expr_context, + old_context; + int ind, + i; + + expr_context = AllocSetContextCreate(CurrentMemoryContext, + "Analyze Expression", + ALLOCSET_DEFAULT_SIZES); + old_context = MemoryContextSwitchTo(expr_context); + + for (ind = 0; ind < nexprs; ind++) + { + AnlExprData *thisdata = &exprdata[ind]; + VacAttrStats *stats = thisdata->vacattrstat; + Node *expr = thisdata->expr; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Datum *exprvals; + bool *exprnulls; + ExprState *exprstate; + int tcnt; + + /* Are we still in the main context? */ + Assert(CurrentMemoryContext == expr_context); + + /* + * Need an EState for evaluation of expressions. Create it in the + * per-expression context to be sure it gets cleaned up at the bottom + * of the loop. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + + /* Set up expression evaluation state */ + exprstate = ExecPrepareExpr((Expr *) expr, estate); + + /* Need a slot to hold the current heap tuple, too */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Compute and save expression values */ + exprvals = (Datum *) palloc(numrows * sizeof(Datum)); + exprnulls = (bool *) palloc(numrows * sizeof(bool)); + + tcnt = 0; + for (i = 0; i < numrows; i++) + { + Datum datum; + bool isnull; + + /* + * Reset the per-tuple context each time, to reclaim any cruft + * left behind by evaluating the statistics expressions. + */ + ResetExprContext(econtext); + + /* Set up for expression evaluation */ + ExecStoreHeapTuple(rows[i], slot, false); + + /* + * Evaluate the expression. We do this in the per-tuple context so + * as not to leak memory, and then copy the result into the + * context created at the beginning of this function. + */ + datum = ExecEvalExprSwitchContext(exprstate, + GetPerTupleExprContext(estate), + &isnull); + if (isnull) + { + exprvals[tcnt] = (Datum) 0; + exprnulls[tcnt] = true; + } + else + { + /* Make sure we copy the data into the context. */ + Assert(CurrentMemoryContext == expr_context); + + exprvals[tcnt] = datumCopy(datum, + stats->attrtype->typbyval, + stats->attrtype->typlen); + exprnulls[tcnt] = false; + } + + tcnt++; + } + + /* + * Now we can compute the statistics for the expression columns. + * + * XXX Unlike compute_index_stats we don't need to switch and reset + * memory contexts here, because we're only computing stats for a + * single expression (and not iterating over many indexes), so we just + * do it in expr_context. Note that compute_stats copies the result + * into stats->anl_context, so it does not disappear. + */ + if (tcnt > 0) + { + AttributeOpts *aopt = + get_attribute_options(stats->attr->attrelid, + stats->attr->attnum); + + stats->exprvals = exprvals; + stats->exprnulls = exprnulls; + stats->rowstride = 1; + stats->compute_stats(stats, + expr_fetch_func, + tcnt, + tcnt); + + /* + * If the n_distinct option is specified, it overrides the above + * computation. + */ + if (aopt != NULL && aopt->n_distinct != 0.0) + stats->stadistinct = aopt->n_distinct; + } + + /* And clean up */ + MemoryContextSwitchTo(expr_context); + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + MemoryContextResetAndDeleteChildren(expr_context); + } + + MemoryContextSwitchTo(old_context); + MemoryContextDelete(expr_context); +} + + +/* + * Fetch function for analyzing statistics object expressions. + * + * We have not bothered to construct tuples from the data, instead the data + * is just in Datum arrays. + */ +static Datum +expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) +{ + int i; + + /* exprvals and exprnulls are already offset for proper column */ + i = rownum * stats->rowstride; + *isNull = stats->exprnulls[i]; + return stats->exprvals[i]; +} + +/* + * Build analyze data for a list of expressions. As this is not tied + * directly to a relation (table or index), we have to fake some of + * the fields in examine_expression(). + */ +static AnlExprData * +build_expr_data(List *exprs, int stattarget) +{ + int idx; + int nexprs = list_length(exprs); + AnlExprData *exprdata; + ListCell *lc; + + exprdata = (AnlExprData *) palloc0(nexprs * sizeof(AnlExprData)); + + idx = 0; + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + AnlExprData *thisdata = &exprdata[idx]; + + thisdata->expr = expr; + thisdata->vacattrstat = examine_expression(expr, stattarget); + idx++; + } + + return exprdata; +} + +/* form an array of pg_statistic rows (per update_attstats) */ +static Datum +serialize_expr_stats(AnlExprData *exprdata, int nexprs) +{ + int exprno; + Oid typOid; + Relation sd; + + ArrayBuildState *astate = NULL; + + sd = table_open(StatisticRelationId, RowExclusiveLock); + + /* lookup OID of composite type for pg_statistic */ + typOid = get_rel_type_id(StatisticRelationId); + if (!OidIsValid(typOid)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"pg_statistic\" does not have a composite type"))); + + for (exprno = 0; exprno < nexprs; exprno++) + { + int i, + k; + VacAttrStats *stats = exprdata[exprno].vacattrstat; + + Datum values[Natts_pg_statistic]; + bool nulls[Natts_pg_statistic]; + HeapTuple stup; + + if (!stats->stats_valid) + { + astate = accumArrayResult(astate, + (Datum) 0, + true, + typOid, + CurrentMemoryContext); + continue; + } + + /* + * Construct a new pg_statistic tuple + */ + for (i = 0; i < Natts_pg_statistic; ++i) + { + nulls[i] = false; + } + + values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(InvalidAttrNumber); + values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(false); + values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac); + values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth); + values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct); + i = Anum_pg_statistic_stakind1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */ + } + i = Anum_pg_statistic_staop1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */ + } + i = Anum_pg_statistic_stacoll1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */ + } + i = Anum_pg_statistic_stanumbers1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + int nnum = stats->numnumbers[k]; + + if (nnum > 0) + { + int n; + Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum)); + ArrayType *arry; + + for (n = 0; n < nnum; n++) + numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]); + /* XXX knows more than it should about type float4: */ + arry = construct_array(numdatums, nnum, + FLOAT4OID, + sizeof(float4), true, TYPALIGN_INT); + values[i++] = PointerGetDatum(arry); /* stanumbersN */ + } + else + { + nulls[i] = true; + values[i++] = (Datum) 0; + } + } + i = Anum_pg_statistic_stavalues1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + if (stats->numvalues[k] > 0) + { + ArrayType *arry; + + arry = construct_array(stats->stavalues[k], + stats->numvalues[k], + stats->statypid[k], + stats->statyplen[k], + stats->statypbyval[k], + stats->statypalign[k]); + values[i++] = PointerGetDatum(arry); /* stavaluesN */ + } + else + { + nulls[i] = true; + values[i++] = (Datum) 0; + } + } + + stup = heap_form_tuple(RelationGetDescr(sd), values, nulls); + + astate = accumArrayResult(astate, + heap_copy_tuple_as_datum(stup, RelationGetDescr(sd)), + false, + typOid, + CurrentMemoryContext); + } + + table_close(sd, RowExclusiveLock); + + return makeArrayResult(astate, CurrentMemoryContext); +} + +/* + * Loads pg_statistic record from expression statistics for expression + * identified by the supplied index. + */ +HeapTuple +statext_expressions_load(Oid stxoid, int idx) +{ + bool isnull; + Datum value; + HeapTuple htup; + ExpandedArrayHeader *eah; + HeapTupleHeader td; + HeapTupleData tmptup; + HeapTuple tup; + + htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(stxoid)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", stxoid); + + value = SysCacheGetAttr(STATEXTDATASTXOID, htup, + Anum_pg_statistic_ext_data_stxdexpr, &isnull); + if (isnull) + elog(ERROR, + "requested statistics kind \"%c\" is not yet built for statistics object %u", + STATS_EXT_DEPENDENCIES, stxoid); + + eah = DatumGetExpandedArray(value); + + deconstruct_expanded_array(eah); + + td = DatumGetHeapTupleHeader(eah->dvalues[idx]); + + /* Build a temporary HeapTuple control structure */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(td); + tmptup.t_data = td; + + tup = heap_copytuple(&tmptup); + + ReleaseSysCache(htup); + + return tup; +} + +/* + * Evaluate the expressions, so that we can use the results to build + * all the requested statistics types. This matters especially for + * expensive expressions, of course. + */ +static StatsBuildData * +make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, + VacAttrStats **stats, int stattarget) +{ + /* evaluated expressions */ + StatsBuildData *result; + char *ptr; + Size len; + + int i; + int k; + int idx; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + List *exprstates = NIL; + int nkeys = bms_num_members(stat->columns) + list_length(stat->exprs); + ListCell *lc; + + /* allocate everything as a single chunk, so we can free it easily */ + len = MAXALIGN(sizeof(StatsBuildData)); + len += MAXALIGN(sizeof(AttrNumber) * nkeys); /* attnums */ + len += MAXALIGN(sizeof(VacAttrStats *) * nkeys); /* stats */ + + /* values */ + len += MAXALIGN(sizeof(Datum *) * nkeys); + len += nkeys * MAXALIGN(sizeof(Datum) * numrows); + + /* nulls */ + len += MAXALIGN(sizeof(bool *) * nkeys); + len += nkeys * MAXALIGN(sizeof(bool) * numrows); + + ptr = palloc(len); + + /* set the pointers */ + result = (StatsBuildData *) ptr; + ptr += MAXALIGN(sizeof(StatsBuildData)); + + /* attnums */ + result->attnums = (AttrNumber *) ptr; + ptr += MAXALIGN(sizeof(AttrNumber) * nkeys); + + /* stats */ + result->stats = (VacAttrStats **) ptr; + ptr += MAXALIGN(sizeof(VacAttrStats *) * nkeys); + + /* values */ + result->values = (Datum **) ptr; + ptr += MAXALIGN(sizeof(Datum *) * nkeys); + + /* nulls */ + result->nulls = (bool **) ptr; + ptr += MAXALIGN(sizeof(bool *) * nkeys); + + for (i = 0; i < nkeys; i++) + { + result->values[i] = (Datum *) ptr; + ptr += MAXALIGN(sizeof(Datum) * numrows); + + result->nulls[i] = (bool *) ptr; + ptr += MAXALIGN(sizeof(bool) * numrows); + } + + Assert((ptr - (char *) result) == len); + + /* we have it allocated, so let's fill the values */ + result->nattnums = nkeys; + result->numrows = numrows; + + /* fill the attribute info - first attributes, then expressions */ + idx = 0; + k = -1; + while ((k = bms_next_member(stat->columns, k)) >= 0) + { + result->attnums[idx] = k; + result->stats[idx] = stats[idx]; + + idx++; + } + + k = -1; + foreach(lc, stat->exprs) + { + Node *expr = (Node *) lfirst(lc); + + result->attnums[idx] = k; + result->stats[idx] = examine_expression(expr, stattarget); + + idx++; + k--; + } + + /* first extract values for all the regular attributes */ + for (i = 0; i < numrows; i++) + { + idx = 0; + k = -1; + while ((k = bms_next_member(stat->columns, k)) >= 0) + { + result->values[idx][i] = heap_getattr(rows[i], k, + result->stats[idx]->tupDesc, + &result->nulls[idx][i]); + + idx++; + } + } + + /* Need an EState for evaluation expressions. */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + + /* Need a slot to hold the current heap tuple, too */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(rel), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up expression evaluation state */ + exprstates = ExecPrepareExprList(stat->exprs, estate); + + for (i = 0; i < numrows; i++) + { + /* + * Reset the per-tuple context each time, to reclaim any cruft left + * behind by evaluating the statistics object expressions. + */ + ResetExprContext(econtext); + + /* Set up for expression evaluation */ + ExecStoreHeapTuple(rows[i], slot, false); + + idx = bms_num_members(stat->columns); + foreach(lc, exprstates) + { + Datum datum; + bool isnull; + ExprState *exprstate = (ExprState *) lfirst(lc); + + /* + * XXX This probably leaks memory. Maybe we should use + * ExecEvalExprSwitchContext but then we need to copy the result + * somewhere else. + */ + datum = ExecEvalExpr(exprstate, + GetPerTupleExprContext(estate), + &isnull); + if (isnull) + { + result->values[idx][i] = (Datum) 0; + result->nulls[idx][i] = true; + } + else + { + result->values[idx][i] = (Datum) datum; + result->nulls[idx][i] = false; + } + + idx++; + } + } + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + + return result; +} diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 8335dff241..2a00fb4848 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -74,7 +74,7 @@ ((ndims) * sizeof(DimensionInfo)) + \ ((nitems) * ITEM_SIZE(ndims))) -static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs); +static MultiSortSupport build_mss(StatsBuildData *data); static SortItem *build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss, int *ndistinct); @@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows) * */ MCVList * -statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, - VacAttrStats **stats, double totalrows, int stattarget) +statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget) { int i, numattrs, + numrows, ngroups, nitems; - AttrNumber *attnums; double mincount; SortItem *items; SortItem *groups; MCVList *mcvlist = NULL; MultiSortSupport mss; - attnums = build_attnums_array(attrs, &numattrs); - /* comparator for all the columns */ - mss = build_mss(stats, numattrs); + mss = build_mss(data); /* sort the rows */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, - mss, numattrs, attnums); + items = build_sorted_items(data, &nitems, mss, + data->nattnums, data->attnums); if (!items) return NULL; + /* for convenience */ + numattrs = data->nattnums; + numrows = data->numrows; + /* transform the sorted rows into groups (sorted by frequency) */ groups = build_distinct_groups(nitems, items, mss, &ngroups); @@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, /* store info about data type OIDs */ for (i = 0; i < numattrs; i++) - mcvlist->types[i] = stats[i]->attrtypid; + mcvlist->types[i] = data->stats[i]->attrtypid; /* Copy the first chunk of groups into the result. */ for (i = 0; i < nitems; i++) @@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, * build MultiSortSupport for the attributes passed in attrs */ static MultiSortSupport -build_mss(VacAttrStats **stats, int numattrs) +build_mss(StatsBuildData *data) { int i; + int numattrs = data->nattnums; /* Sort by multiple columns (using array of SortSupport) */ MultiSortSupport mss = multi_sort_init(numattrs); @@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs) /* prepare the sort functions for all the attributes */ for (i = 0; i < numattrs; i++) { - VacAttrStats *colstat = stats[i]; + VacAttrStats *colstat = data->stats[i]; TypeCacheEntry *type; type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); @@ -1524,6 +1526,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS) } /* + * match the attribute/expression to a dimension of the statistic + * + * Match the attribute/expression to statistics dimension. Optionally + * determine the collation. + */ +static int +mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid) +{ + int idx = -1; + + if (IsA(expr, Var)) + { + /* simple Var, so just lookup using varattno */ + Var *var = (Var *) expr; + + if (collid) + *collid = var->varcollid; + + idx = bms_member_index(keys, var->varattno); + + /* make sure the index is valid */ + Assert((idx >= 0) && (idx <= bms_num_members(keys))); + } + else + { + ListCell *lc; + + /* expressions are stored after the simple columns */ + idx = bms_num_members(keys); + + if (collid) + *collid = exprCollation(expr); + + /* expression - lookup in stats expressions */ + foreach(lc, exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + if (equal(expr, stat_expr)) + break; + + idx++; + } + + /* make sure the index is valid */ + Assert((idx >= bms_num_members(keys)) && + (idx <= bms_num_members(keys) + list_length(exprs))); + } + + return idx; +} + +/* * mcv_get_match_bitmap * Evaluate clauses using the MCV list, and update the match bitmap. * @@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS) */ static bool * mcv_get_match_bitmap(PlannerInfo *root, List *clauses, - Bitmapset *keys, MCVList *mcvlist, bool is_or) + Bitmapset *keys, List *exprs, + MCVList *mcvlist, bool is_or) { int i; ListCell *l; @@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, OpExpr *expr = (OpExpr *) clause; FmgrInfo opproc; - /* valid only after examine_clause_args returns true */ - Var *var; + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; Const *cst; - bool varonleft; + bool expronleft; + int idx; + Oid collid; fmgr_info(get_opcode(expr->opno), &opproc); - /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); + + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) { - int idx; + bool match = true; + MCVItem *item = &mcvlist->items[i]; - /* match the attribute to a dimension of the statistic */ - idx = bms_member_index(keys, var->varattno); + Assert(idx >= 0); /* - * Walk through the MCV items and evaluate the current clause. - * We can skip items that were already ruled out, and - * terminate if there are no remaining MCV items that might - * possibly match. + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. */ - for (i = 0; i < mcvlist->nitems; i++) + if (item->isnull[idx] || cst->constisnull) { - bool match = true; - MCVItem *item = &mcvlist->items[i]; - - /* - * When the MCV item or the Const value is NULL we can - * treat this as a mismatch. We must not call the operator - * because of strictness. - */ - if (item->isnull[idx] || cst->constisnull) - { - matches[i] = RESULT_MERGE(matches[i], is_or, false); - continue; - } + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; + } - /* - * Skip MCV items that can't change result in the bitmap. - * Once the value gets false for AND-lists, or true for - * OR-lists, we don't need to look at more clauses. - */ - if (RESULT_IS_FINAL(matches[i], is_or)) - continue; + /* + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; - /* - * First check whether the constant is below the lower - * boundary (in that case we can skip the bucket, because - * there's no overlap). - * - * We don't store collations used to build the statistics, - * but we can use the collation for the attribute itself, - * as stored in varcollid. We do reset the statistics - * after a type change (including collation change), so - * this is OK. We may need to relax this after allowing - * extended statistics on expressions. - */ - if (varonleft) - match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - item->values[idx], - cst->constvalue)); - else - match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - cst->constvalue, - item->values[idx])); - - /* update the match bitmap with the result */ - matches[i] = RESULT_MERGE(matches[i], is_or, match); - } + /* + * First check whether the constant is below the lower + * boundary (in that case we can skip the bucket, because + * there's no overlap). + * + * We don't store collations used to build the statistics, but + * we can use the collation for the attribute itself, as + * stored in varcollid. We do reset the statistics after a + * type change (including collation change), so this is OK. + * For expressions we use the collation extracted from the + * expression itself. + */ + if (expronleft) + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + cst->constvalue)); + else + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + cst->constvalue, + item->values[idx])); + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); } } else if (IsA(clause, ScalarArrayOpExpr)) @@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; FmgrInfo opproc; - /* valid only after examine_clause_args returns true */ - Var *var; + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; Const *cst; - bool varonleft; + bool expronleft; + Oid collid; + int idx; + + /* array evaluation */ + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; fmgr_info(get_opcode(expr->opno), &opproc); - /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); + + /* ScalarArrayOpExpr has the Var always on the left */ + Assert(expronleft); + + /* XXX what if (cst->constisnull == NULL)? */ + if (!cst->constisnull) { - int idx; + arrayval = DatumGetArrayTypeP(cst->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + } - ArrayType *arrayval; - int16 elmlen; - bool elmbyval; - char elmalign; - int num_elems; - Datum *elem_values; - bool *elem_nulls; + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); - /* ScalarArrayOpExpr has the Var always on the left */ - Assert(varonleft); + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + int j; + bool match = (expr->useOr ? false : true); + MCVItem *item = &mcvlist->items[i]; - if (!cst->constisnull) + /* + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. + */ + if (item->isnull[idx] || cst->constisnull) { - arrayval = DatumGetArrayTypeP(cst->constvalue); - get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), - &elmlen, &elmbyval, &elmalign); - deconstruct_array(arrayval, - ARR_ELEMTYPE(arrayval), - elmlen, elmbyval, elmalign, - &elem_values, &elem_nulls, &num_elems); + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; } - /* match the attribute to a dimension of the statistic */ - idx = bms_member_index(keys, var->varattno); - /* - * Walk through the MCV items and evaluate the current clause. - * We can skip items that were already ruled out, and - * terminate if there are no remaining MCV items that might - * possibly match. + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. */ - for (i = 0; i < mcvlist->nitems; i++) + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + for (j = 0; j < num_elems; j++) { - int j; - bool match = (expr->useOr ? false : true); - MCVItem *item = &mcvlist->items[i]; + Datum elem_value = elem_values[j]; + bool elem_isnull = elem_nulls[j]; + bool elem_match; - /* - * When the MCV item or the Const value is NULL we can - * treat this as a mismatch. We must not call the operator - * because of strictness. - */ - if (item->isnull[idx] || cst->constisnull) + /* NULL values always evaluate as not matching. */ + if (elem_isnull) { - matches[i] = RESULT_MERGE(matches[i], is_or, false); + match = RESULT_MERGE(match, expr->useOr, false); continue; } /* - * Skip MCV items that can't change result in the bitmap. - * Once the value gets false for AND-lists, or true for - * OR-lists, we don't need to look at more clauses. + * Stop evaluating the array elements once we reach match + * value that can't change - ALL() is the same as + * AND-list, ANY() is the same as OR-list. */ - if (RESULT_IS_FINAL(matches[i], is_or)) - continue; + if (RESULT_IS_FINAL(match, expr->useOr)) + break; - for (j = 0; j < num_elems; j++) - { - Datum elem_value = elem_values[j]; - bool elem_isnull = elem_nulls[j]; - bool elem_match; - - /* NULL values always evaluate as not matching. */ - if (elem_isnull) - { - match = RESULT_MERGE(match, expr->useOr, false); - continue; - } - - /* - * Stop evaluating the array elements once we reach - * match value that can't change - ALL() is the same - * as AND-list, ANY() is the same as OR-list. - */ - if (RESULT_IS_FINAL(match, expr->useOr)) - break; - - elem_match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - item->values[idx], - elem_value)); - - match = RESULT_MERGE(match, expr->useOr, elem_match); - } + elem_match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + elem_value)); - /* update the match bitmap with the result */ - matches[i] = RESULT_MERGE(matches[i], is_or, match); + match = RESULT_MERGE(match, expr->useOr, elem_match); } + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); } } else if (IsA(clause, NullTest)) { NullTest *expr = (NullTest *) clause; - Var *var = (Var *) (expr->arg); + Node *clause_expr = (Node *) (expr->arg); - /* match the attribute to a dimension of the statistic */ - int idx = bms_member_index(keys, var->varattno); + /* match the attribute/expression to a dimension of the statistic */ + int idx = mcv_match_expression(clause_expr, keys, exprs, NULL); /* * Walk through the MCV items and evaluate the current clause. We @@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, Assert(list_length(bool_clauses) >= 2); /* build the match bitmap for the OR-clauses */ - bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, + bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs, mcvlist, is_orclause(clause)); /* @@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, Assert(list_length(not_args) == 1); /* build the match bitmap for the NOT-clause */ - not_matches = mcv_get_match_bitmap(root, not_args, keys, + not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs, mcvlist, false); /* @@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, mcv = statext_mcv_load(stat->statOid); /* build a match bitmap for the clauses */ - matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false); + matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs, + mcv, false); /* sum frequencies for all the matching MCV items */ *basesel = 0.0; @@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, /* build the match bitmap for the new clause */ new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, - mcv, false); + stat->exprs, mcv, false); /* * Sum the frequencies for all the MCV items matching this clause and also diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c index e08c001e3f..4481312d61 100644 --- a/src/backend/statistics/mvdistinct.c +++ b/src/backend/statistics/mvdistinct.c @@ -36,8 +36,7 @@ #include "utils/syscache.h" #include "utils/typcache.h" -static double ndistinct_for_combination(double totalrows, int numrows, - HeapTuple *rows, VacAttrStats **stats, +static double ndistinct_for_combination(double totalrows, StatsBuildData *data, int k, int *combination); static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); static int n_choose_k(int n, int k); @@ -81,15 +80,18 @@ static void generate_combinations(CombinationGenerator *state); * * This computes the ndistinct estimate using the same estimator used * in analyze.c and then computes the coefficient. + * + * To handle expressions easily, we treat them as system attributes with + * negative attnums, and offset everything by number of expressions to + * allow using Bitmapsets. */ MVNDistinct * -statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats) +statext_ndistinct_build(double totalrows, StatsBuildData *data) { MVNDistinct *result; int k; int itemcnt; - int numattrs = bms_num_members(attrs); + int numattrs = data->nattnums; int numcombs = num_combinations(numattrs); result = palloc(offsetof(MVNDistinct, items) + @@ -112,13 +114,19 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, MVNDistinctItem *item = &result->items[itemcnt]; int j; - item->attrs = NULL; + item->attributes = palloc(sizeof(AttrNumber) * k); + item->nattributes = k; + + /* translate the indexes to attnums */ for (j = 0; j < k; j++) - item->attrs = bms_add_member(item->attrs, - stats[combination[j]]->attr->attnum); + { + item->attributes[j] = data->attnums[combination[j]]; + + Assert(AttributeNumberIsValid(item->attributes[j])); + } + item->ndistinct = - ndistinct_for_combination(totalrows, numrows, rows, - stats, k, combination); + ndistinct_for_combination(totalrows, data, k, combination); itemcnt++; Assert(itemcnt <= result->nitems); @@ -189,7 +197,7 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct) { int nmembers; - nmembers = bms_num_members(ndistinct->items[i].attrs); + nmembers = ndistinct->items[i].nattributes; Assert(nmembers >= 2); len += SizeOfItem(nmembers); @@ -214,22 +222,15 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct) for (i = 0; i < ndistinct->nitems; i++) { MVNDistinctItem item = ndistinct->items[i]; - int nmembers = bms_num_members(item.attrs); - int x; + int nmembers = item.nattributes; memcpy(tmp, &item.ndistinct, sizeof(double)); tmp += sizeof(double); memcpy(tmp, &nmembers, sizeof(int)); tmp += sizeof(int); - x = -1; - while ((x = bms_next_member(item.attrs, x)) >= 0) - { - AttrNumber value = (AttrNumber) x; - - memcpy(tmp, &value, sizeof(AttrNumber)); - tmp += sizeof(AttrNumber); - } + memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers); + tmp += nmembers * sizeof(AttrNumber); /* protect against overflows */ Assert(tmp <= ((char *) output + len)); @@ -301,27 +302,21 @@ statext_ndistinct_deserialize(bytea *data) for (i = 0; i < ndistinct->nitems; i++) { MVNDistinctItem *item = &ndistinct->items[i]; - int nelems; - - item->attrs = NULL; /* ndistinct value */ memcpy(&item->ndistinct, tmp, sizeof(double)); tmp += sizeof(double); /* number of attributes */ - memcpy(&nelems, tmp, sizeof(int)); + memcpy(&item->nattributes, tmp, sizeof(int)); tmp += sizeof(int); - Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS)); + Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS)); - while (nelems-- > 0) - { - AttrNumber attno; + item->attributes + = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber)); - memcpy(&attno, tmp, sizeof(AttrNumber)); - tmp += sizeof(AttrNumber); - item->attrs = bms_add_member(item->attrs, attno); - } + memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes); + tmp += sizeof(AttrNumber) * item->nattributes; /* still within the bytea */ Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); @@ -369,17 +364,17 @@ pg_ndistinct_out(PG_FUNCTION_ARGS) for (i = 0; i < ndist->nitems; i++) { + int j; MVNDistinctItem item = ndist->items[i]; - int x = -1; - bool first = true; if (i > 0) appendStringInfoString(&str, ", "); - while ((x = bms_next_member(item.attrs, x)) >= 0) + for (j = 0; j < item.nattributes; j++) { - appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x); - first = false; + AttrNumber attnum = item.attributes[j]; + + appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum); } appendStringInfo(&str, "\": %d", (int) item.ndistinct); } @@ -427,8 +422,8 @@ pg_ndistinct_send(PG_FUNCTION_ARGS) * combination of multiple columns. */ static double -ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, - VacAttrStats **stats, int k, int *combination) +ndistinct_for_combination(double totalrows, StatsBuildData *data, + int k, int *combination) { int i, j; @@ -439,6 +434,7 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, Datum *values; SortItem *items; MultiSortSupport mss; + int numrows = data->numrows; mss = multi_sort_init(k); @@ -467,25 +463,27 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, */ for (i = 0; i < k; i++) { - VacAttrStats *colstat = stats[combination[i]]; + Oid typid; TypeCacheEntry *type; + Oid collid = InvalidOid; + VacAttrStats *colstat = data->stats[combination[i]]; + + typid = colstat->attrtypid; + collid = colstat->attrcollid; - type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); + type = lookup_type_cache(typid, TYPECACHE_LT_OPR); if (type->lt_opr == InvalidOid) /* shouldn't happen */ elog(ERROR, "cache lookup failed for ordering operator for type %u", - colstat->attrtypid); + typid); /* prepare the sort function for this dimension */ - multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid); + multi_sort_add_dimension(mss, i, type->lt_opr, collid); /* accumulate all the data for this dimension into the arrays */ for (j = 0; j < numrows; j++) { - items[j].values[i] = - heap_getattr(rows[j], - colstat->attr->attnum, - colstat->tupDesc, - &items[j].isnull[i]); + items[j].values[i] = data->values[combination[i]][j]; + items[j].isnull[i] = data->nulls[combination[i]][j]; } } |
