Diffstat (limited to 'src/backend/optimizer')
-rw-r--r--	src/backend/optimizer/path/costsize.c	| 193
-rw-r--r--	src/backend/optimizer/path/joinpath.c	|  58
-rw-r--r--	src/backend/optimizer/plan/createplan.c	|  10
-rw-r--r--	src/backend/optimizer/plan/initsplan.c	|   9
-rw-r--r--	src/backend/optimizer/plan/planner.c	|   6
-rw-r--r--	src/backend/optimizer/prep/prepunion.c	|   6
-rw-r--r--	src/backend/optimizer/util/pathnode.c	|  12
-rw-r--r--	src/backend/optimizer/util/plancat.c	|   3
8 files changed, 219 insertions, 78 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c52af72a16b..bdfbbb18186 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -41,7 +41,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.70 2001/04/25 22:04:37 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.71 2001/05/07 00:43:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -50,11 +50,15 @@
 
 #include <math.h>
 
+#include "catalog/pg_statistic.h"
 #include "executor/nodeHash.h"
 #include "miscadmin.h"
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
+#include "optimizer/pathnode.h"
+#include "parser/parsetree.h"
 #include "utils/lsyscache.h"
+#include "utils/syscache.h"
 
 
 /*
@@ -573,7 +577,7 @@ cost_mergejoin(Path *path,
  * 'outer_path' is the path for the outer relation
  * 'inner_path' is the path for the inner relation
  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
- * 'innerdispersion' is an estimate of the dispersion statistic
+ * 'innerbucketsize' is an estimate of the bucketsize statistic
  *		for the inner hash key.
  */
 void
@@ -581,7 +585,7 @@ cost_hashjoin(Path *path,
 			  Path *outer_path,
 			  Path *inner_path,
 			  List *restrictlist,
-			  Selectivity innerdispersion)
+			  Selectivity innerbucketsize)
 {
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
@@ -607,22 +611,20 @@ cost_hashjoin(Path *path,
 
 	/*
 	 * The number of tuple comparisons needed is the number of outer
-	 * tuples times the typical hash bucket size. nodeHash.c tries for
-	 * average bucket loading of NTUP_PER_BUCKET, but that goal will be
-	 * reached only if data values are uniformly distributed among the
-	 * buckets. To be conservative, we scale up the target bucket size by
-	 * the number of inner rows times inner dispersion, giving an estimate
-	 * of the typical number of duplicates of each value. We then charge
-	 * one cpu_operator_cost per tuple comparison.
+	 * tuples times the typical number of tuples in a hash bucket,
+	 * which is the inner relation size times its bucketsize fraction.
+	 * We charge one cpu_operator_cost per tuple comparison.
 	 */
 	run_cost += cpu_operator_cost * outer_path->parent->rows *
-		NTUP_PER_BUCKET * ceil(inner_path->parent->rows * innerdispersion);
+		ceil(inner_path->parent->rows * innerbucketsize);
 
 	/*
 	 * Estimate the number of tuples that get through the hashing filter
 	 * as one per tuple in the two source relations. This could be a
 	 * drastic underestimate if there are many equal-keyed tuples in
-	 * either relation, but we have no good way of estimating that...
+	 * either relation, but we have no simple way of estimating that;
+	 * and since this is only a second-order parameter, it's probably
+	 * not worth expending a lot of effort on the estimate.
 	 */
 	ntuples = outer_path->parent->rows + inner_path->parent->rows;
 
@@ -651,7 +653,7 @@ cost_hashjoin(Path *path,
 	/*
 	 * Bias against putting larger relation on inside. We don't want an
 	 * absolute prohibition, though, since larger relation might have
-	 * better dispersion --- and we can't trust the size estimates
+	 * better bucketsize --- and we can't trust the size estimates
 	 * unreservedly, anyway. Instead, inflate the startup cost by the
 	 * square root of the size ratio. (Why square root? No real good
	 * reason, but it seems reasonable...)
@@ -663,6 +665,171 @@ cost_hashjoin(Path *path,
 
 	path->total_cost = startup_cost + run_cost;
 }
 
+/*
+ * Estimate hash bucketsize fraction (ie, number of entries in a bucket
+ * divided by total tuples in relation) if the specified Var is used
+ * as a hash key.
+ *
+ * This statistic is used by cost_hashjoin. We split out the calculation
+ * because it's useful to cache the result for re-use across multiple path
+ * cost calculations.
+ *
+ * XXX This is really pretty bogus since we're effectively assuming that the
+ * distribution of hash keys will be the same after applying restriction
+ * clauses as it was in the underlying relation. However, we are not nearly
+ * smart enough to figure out how the restrict clauses might change the
+ * distribution, so this will have to do for now.
+ *
+ * The executor tries for average bucket loading of NTUP_PER_BUCKET by setting
+ * number of buckets equal to ntuples / NTUP_PER_BUCKET, which would yield
+ * a bucketsize fraction of NTUP_PER_BUCKET / ntuples. But that goal will
+ * be reached only if the data values are uniformly distributed among the
+ * buckets, which requires (a) at least ntuples / NTUP_PER_BUCKET distinct
+ * data values, and (b) a not-too-skewed data distribution. Otherwise the
+ * buckets will be nonuniformly occupied. If the other relation in the join
+ * has a similar distribution, the most-loaded buckets are exactly those
+ * that will be probed most often. Therefore, the "average" bucket size for
+ * costing purposes should really be taken as something close to the "worst
+ * case" bucket size. We try to estimate this by first scaling up if there
+ * are too few distinct data values, and then scaling up again by the
+ * ratio of the most common value's frequency to the average frequency.
+ *
+ * If no statistics are available, use a default estimate of 0.1. This will
+ * discourage use of a hash rather strongly if the inner relation is large,
+ * which is what we want. We do not want to hash unless we know that the
+ * inner rel is well-dispersed (or the alternatives seem much worse).
+ */
+Selectivity
+estimate_hash_bucketsize(Query *root, Var *var)
+{
+	Oid			relid;
+	RelOptInfo *rel;
+	HeapTuple	tuple;
+	Form_pg_statistic stats;
+	double		estfract,
+				ndistinct,
+				needdistinct,
+				mcvfreq,
+				avgfreq;
+	float4	   *numbers;
+	int			nnumbers;
+
+	/*
+	 * Lookup info about var's relation and attribute;
+	 * if none available, return default estimate.
+	 */
+	if (!IsA(var, Var))
+		return 0.1;
+
+	relid = getrelid(var->varno, root->rtable);
+	if (relid == InvalidOid)
+		return 0.1;
+
+	rel = get_base_rel(root, var->varno);
+
+	if (rel->tuples <= 0.0 || rel->rows <= 0.0)
+		return 0.1;				/* ensure we can divide below */
+
+	tuple = SearchSysCache(STATRELATT,
+						   ObjectIdGetDatum(relid),
+						   Int16GetDatum(var->varattno),
+						   0, 0);
+	if (!HeapTupleIsValid(tuple))
+	{
+		/*
+		 * Perhaps the Var is a system attribute; if so, it will have no
+		 * entry in pg_statistic, but we may be able to guess something
+		 * about its distribution anyway.
+		 */
+		switch (var->varattno)
+		{
+			case ObjectIdAttributeNumber:
+			case SelfItemPointerAttributeNumber:
+				/* these are unique, so buckets should be well-distributed */
+				return (double) NTUP_PER_BUCKET / rel->rows;
+			case TableOidAttributeNumber:
+				/* hashing this is a terrible idea... */
+				return 1.0;
+		}
+		return 0.1;
+	}
+	stats = (Form_pg_statistic) GETSTRUCT(tuple);
+
+	/*
+	 * Obtain number of distinct data values in raw relation.
+	 */
+	ndistinct = stats->stadistinct;
+	if (ndistinct < 0.0)
+		ndistinct = -ndistinct * rel->tuples;
+
+	/*
+	 * Adjust ndistinct to account for restriction clauses. Observe we are
+	 * assuming that the data distribution is affected uniformly by the
+	 * restriction clauses!
+	 *
+	 * XXX Possibly better way, but much more expensive: multiply by
+	 * selectivity of rel's restriction clauses that mention the target Var.
+	 */
+	ndistinct *= rel->rows / rel->tuples;
+
+	/*
+	 * Discourage use of hash join if there seem not to be very many distinct
+	 * data values. The threshold here is somewhat arbitrary, as is the
+	 * fraction used to "discourage" the choice.
+	 */
+	if (ndistinct < 50.0)
+	{
+		ReleaseSysCache(tuple);
+		return 0.5;
+	}
+
+	/*
+	 * Form initial estimate of bucketsize fraction. Here we use rel->rows,
+	 * ie the number of rows after applying restriction clauses, because
+	 * that's what the fraction will eventually be multiplied by in
+	 * cost_heapjoin.
+	 */
+	estfract = (double) NTUP_PER_BUCKET / rel->rows;
+
+	/*
+	 * Adjust estimated bucketsize if too few distinct values to fill
+	 * all the buckets.
+	 */
+	needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
+	if (ndistinct < needdistinct)
+		estfract *= needdistinct / ndistinct;
+
+	/*
+	 * Look up the frequency of the most common value, if available.
+	 */
+	mcvfreq = 0.0;
+
+	if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
+						 STATISTIC_KIND_MCV, InvalidOid,
+						 NULL, NULL, &numbers, &nnumbers))
+	{
+		/*
+		 * The first MCV stat is for the most common value.
+		 */
+		if (nnumbers > 0)
+			mcvfreq = numbers[0];
+		free_attstatsslot(var->vartype, NULL, 0,
+						  numbers, nnumbers);
+	}
+
+	/*
+	 * Adjust estimated bucketsize upward to account for skewed distribution.
+	 */
+	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
+
+	if (avgfreq > 0.0 && mcvfreq > avgfreq)
+		estfract *= mcvfreq / avgfreq;
+
+	ReleaseSysCache(tuple);
+
+	return (Selectivity) estfract;
+}
+
 /*
  * cost_qual_eval
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index d41336ddcee..cd7cabd41de 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -8,15 +8,15 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.63 2001/04/15 00:48:17 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.64 2001/05/07 00:43:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
+#include "postgres.h"
+
 #include <sys/types.h>
 #include <math.h>
 
-#include "postgres.h"
-
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
@@ -45,7 +45,6 @@ static void hash_inner_and_outer(Query *root, RelOptInfo *joinrel,
 				 List *restrictlist, JoinType jointype);
 static Path *best_innerjoin(List *join_paths, List *outer_relid,
 			 JoinType jointype);
-static Selectivity estimate_dispersion(Query *root, Var *var);
 static List *select_mergejoin_clauses(RelOptInfo *joinrel,
 						RelOptInfo *outerrel,
 						RelOptInfo *innerrel,
@@ -722,7 +721,7 @@ hash_inner_and_outer(Query *root,
 		Expr	   *clause;
 		Var		   *left,
 				   *right;
-		Selectivity innerdispersion;
+		Selectivity innerbucketsize;
 		List	   *hashclauses;
 
 		if (restrictinfo->hashjoinoperator == InvalidOid)
@@ -742,34 +741,34 @@ hash_inner_and_outer(Query *root,
 
 		/*
 		 * Check if clause is usable with these sub-rels, find inner side,
-		 * estimate dispersion of inner var for costing purposes.
+		 * estimate bucketsize of inner var for costing purposes.
 		 *
 		 * Since we tend to visit the same clauses over and over when
-		 * planning a large query, we cache the dispersion estimates in
+		 * planning a large query, we cache the bucketsize estimates in
 		 * the RestrictInfo node to avoid repeated lookups of statistics.
 		 */
 		if (intMember(left->varno, outerrelids) &&
 			intMember(right->varno, innerrelids))
 		{
 			/* righthand side is inner */
-			innerdispersion = restrictinfo->right_dispersion;
-			if (innerdispersion < 0)
+			innerbucketsize = restrictinfo->right_bucketsize;
+			if (innerbucketsize < 0)
 			{					/* not cached yet */
-				innerdispersion = estimate_dispersion(root, right);
-				restrictinfo->right_dispersion = innerdispersion;
+				innerbucketsize = estimate_hash_bucketsize(root, right);
+				restrictinfo->right_bucketsize = innerbucketsize;
 			}
 		}
 		else if (intMember(left->varno, innerrelids) &&
 				 intMember(right->varno, outerrelids))
 		{
 			/* lefthand side is inner */
-			innerdispersion = restrictinfo->left_dispersion;
-			if (innerdispersion < 0)
+			innerbucketsize = restrictinfo->left_bucketsize;
+			if (innerbucketsize < 0)
 			{					/* not cached yet */
-				innerdispersion = estimate_dispersion(root, left);
-				restrictinfo->left_dispersion = innerdispersion;
+				innerbucketsize = estimate_hash_bucketsize(root, left);
+				restrictinfo->left_bucketsize = innerbucketsize;
 			}
 		}
 		else
@@ -790,7 +789,7 @@ hash_inner_and_outer(Query *root,
 										 innerrel->cheapest_total_path,
 										 restrictlist,
 										 hashclauses,
-										 innerdispersion));
+										 innerbucketsize));
 		if (outerrel->cheapest_startup_path != outerrel->cheapest_total_path)
 			add_path(joinrel, (Path *)
 					 create_hashjoin_path(joinrel,
@@ -799,7 +798,7 @@ hash_inner_and_outer(Query *root,
 										 innerrel->cheapest_total_path,
 										 restrictlist,
 										 hashclauses,
-										 innerdispersion));
+										 innerbucketsize));
 	}
 }
 
@@ -867,31 +866,6 @@ best_innerjoin(List *join_paths, Relids outer_relids, JoinType jointype)
 }
 
 /*
- * Estimate dispersion of the specified Var
- *
- * We use a default of 0.1 if we can't figure out anything better.
- * This will typically discourage use of a hash rather strongly,
- * if the inner relation is large. We do not want to hash unless
- * we know that the inner rel is well-dispersed (or the alternatives
- * seem much worse).
- */
-static Selectivity
-estimate_dispersion(Query *root, Var *var)
-{
-	Oid			relid;
-
-	if (!IsA(var, Var))
-		return 0.1;
-
-	relid = getrelid(var->varno, root->rtable);
-
-	if (relid == InvalidOid)
-		return 0.1;
-
-	return (Selectivity) get_attdispersion(relid, var->varattno, 0.1);
-}
-
-/*
  * select_mergejoin_clauses
  *	  Select mergejoin clauses that are usable for a particular join.
 *	  Returns a list of RestrictInfo nodes for those clauses.
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 8c3b00289d3..2d264c46881 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,14 +10,14 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.104 2001/03/22 03:59:36 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.105 2001/05/07 00:43:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-#include <sys/types.h>
-
 #include "postgres.h"
 
+#include <sys/types.h>
+
 #include "catalog/pg_index.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@@ -1484,9 +1484,9 @@ make_sort_from_pathkeys(List *tlist, Plan *lefttree, List *pathkeys)
 			 */
 			if (resdom->reskey == 0)
 			{
-				/* OK, mark it as a sort key and set the sort operator regproc */
+				/* OK, mark it as a sort key and set the sort operator */
 				resdom->reskey = ++numsortkeys;
-				resdom->reskeyop = get_opcode(pathkey->sortop);
+				resdom->reskeyop = pathkey->sortop;
 			}
 		}
 
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index 7c3e15a8f88..5d67e02dacb 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -8,13 +8,14 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.59 2001/04/16 19:44:10 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.60 2001/05/07 00:43:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
+#include "postgres.h"
+
 #include <sys/types.h>
 
-#include "postgres.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_type.h"
 #include "nodes/makefuncs.h"
@@ -348,8 +349,8 @@ distribute_qual_to_rels(Query *root, Node *clause,
 	restrictinfo->left_pathkey = NIL;	/* not computable yet */
 	restrictinfo->right_pathkey = NIL;
 	restrictinfo->hashjoinoperator = InvalidOid;
-	restrictinfo->left_dispersion = -1; /* not computed until needed */
-	restrictinfo->right_dispersion = -1;
+	restrictinfo->left_bucketsize = -1; /* not computed until needed */
+	restrictinfo->right_bucketsize = -1;
 
 	/*
 	 * Retrieve all relids and vars contained within the clause.
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b2ab4600209..0aba4808c16 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.105 2001/04/30 19:24:47 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.106 2001/05/07 00:43:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1367,7 +1367,7 @@ make_groupplan(List *group_tlist,
 		{
 			/* OK, insert the ordering info needed by the executor. */
 			resdom->reskey = ++keyno;
-			resdom->reskeyop = get_opcode(grpcl->sortop);
+			resdom->reskeyop = grpcl->sortop;
 		}
 	}
 
@@ -1412,7 +1412,7 @@ make_sortplan(List *tlist, Plan *plannode, List *sortcls)
 		{
			/* OK, insert the ordering info needed by the executor.
 			 */
 			resdom->reskey = ++keyno;
-			resdom->reskeyop = get_opcode(sortcl->sortop);
+			resdom->reskeyop = sortcl->sortop;
 		}
 	}
 
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 0b173466cf9..ede4159d970 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/prep/prepunion.c,v 1.62 2001/03/27 18:02:19 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/prep/prepunion.c,v 1.63 2001/05/07 00:43:22 tgl Exp $
  *
 *-------------------------------------------------------------------------
 */
@@ -682,8 +682,8 @@ adjust_inherited_attrs_mutator(Node *node,
 		newinfo->eval_cost = -1;	/* reset this too */
 		newinfo->left_pathkey = NIL;	/* and these */
 		newinfo->right_pathkey = NIL;
-		newinfo->left_dispersion = -1;
-		newinfo->right_dispersion = -1;
+		newinfo->left_bucketsize = -1;
+		newinfo->right_bucketsize = -1;
 
 		return (Node *) newinfo;
 	}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index cfba3ee395f..407c132b4f7 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,14 +8,14 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.71 2001/03/22 03:59:39 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.72 2001/05/07 00:43:22 tgl Exp $
  *
 *-------------------------------------------------------------------------
 */
-#include <math.h>
-
 #include "postgres.h"
 
+#include <math.h>
+
 #include "nodes/plannodes.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
@@ -559,7 +559,7 @@ create_mergejoin_path(RelOptInfo *joinrel,
  * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
  * 'hashclauses' is a list of the hash join clause (always a 1-element list)
  *		(this should be a subset of the restrict_clauses list)
- * 'innerdispersion' is an estimate of the dispersion of the inner hash key
+ * 'innerbucketsize' is an estimate of the bucketsize of the inner hash key
  *
  */
 HashPath *
@@ -569,7 +569,7 @@ create_hashjoin_path(RelOptInfo *joinrel,
 					 Path *inner_path,
 					 List *restrict_clauses,
 					 List *hashclauses,
-					 Selectivity innerdispersion)
+					 Selectivity innerbucketsize)
 {
 	HashPath   *pathnode = makeNode(HashPath);
 
@@ -587,7 +587,7 @@ create_hashjoin_path(RelOptInfo *joinrel,
 				   outer_path,
 				   inner_path,
 				   restrict_clauses,
-				   innerdispersion);
+				   innerbucketsize);
 
 	return pathnode;
 }
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 4f711df203c..ee3523553e8 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -9,11 +9,10 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/plancat.c,v 1.64 2001/03/22 03:59:40 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/plancat.c,v 1.65 2001/05/07 00:43:22 tgl Exp $
  *
 *-------------------------------------------------------------------------
 */
-
 #include "postgres.h"
 
 #include <math.h>
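
A note for readers skimming the patch: the sketch below restates the new bucketsize arithmetic outside the planner, with plain doubles standing in for RelOptInfo and the pg_statistic syscache lookup. The struct, the function names, and the sample numbers in main() are hypothetical; the 0.1 default, the 50-distinct-values cutoff, the needdistinct scaling, and the MCV skew correction follow the patch, and NTUP_PER_BUCKET is set to 10 here purely for illustration. The final computation shows how the result feeds the cost_hashjoin comparison term, cpu_operator_cost * outer rows * ceil(inner rows * bucketsize).

/*
 * Standalone sketch (not part of the patch) of the bucketsize estimate
 * described in the commit, using plain doubles instead of planner data
 * structures. All names here are made up for illustration.
 */
#include <math.h>
#include <stdio.h>

#define NTUP_PER_BUCKET 10		/* illustrative value only */

typedef struct ColumnStats
{
	double		tuples;			/* raw tuples in the relation */
	double		rows;			/* rows surviving restriction clauses */
	double		stadistinct;	/* as in pg_statistic: < 0 means a fraction */
	double		stanullfrac;	/* fraction of NULLs */
	double		mcvfreq;		/* frequency of the most common value, or 0 */
	int			have_stats;		/* 0 if no pg_statistic entry */
} ColumnStats;

/* Estimate the fraction of the inner relation that lands in one bucket. */
static double
sketch_bucketsize(const ColumnStats *st)
{
	double		ndistinct,
				needdistinct,
				estfract,
				avgfreq;

	if (!st->have_stats || st->tuples <= 0.0 || st->rows <= 0.0)
		return 0.1;				/* default: discourages hashing large rels */

	/* stadistinct < 0 means "minus this fraction of the rows" */
	ndistinct = st->stadistinct;
	if (ndistinct < 0.0)
		ndistinct = -ndistinct * st->tuples;

	/* assume restrictions thin out distinct values proportionally */
	ndistinct *= st->rows / st->tuples;

	if (ndistinct < 50.0)
		return 0.5;				/* too few distinct values: discourage hashing */

	/* ideal case: NTUP_PER_BUCKET tuples per bucket */
	estfract = (double) NTUP_PER_BUCKET / st->rows;

	/* too few distinct values to fill every bucket */
	needdistinct = st->rows / (double) NTUP_PER_BUCKET;
	if (ndistinct < needdistinct)
		estfract *= needdistinct / ndistinct;

	/* scale up by skew: most common value versus the average value */
	avgfreq = (1.0 - st->stanullfrac) / ndistinct;
	if (avgfreq > 0.0 && st->mcvfreq > avgfreq)
		estfract *= st->mcvfreq / avgfreq;

	return estfract;
}

int
main(void)
{
	/* hypothetical inner rel: 100k rows, 1000 distinct values, 5% in the MCV */
	ColumnStats st = {100000.0, 100000.0, 1000.0, 0.0, 0.05, 1};
	double		bucketsize = sketch_bucketsize(&st);
	double		cpu_operator_cost = 0.0025;
	double		outer_rows = 50000.0;

	/* cost term from cost_hashjoin: outer rows times tuples per bucket */
	double		compare_cost = cpu_operator_cost * outer_rows *
	ceil(st.rows * bucketsize);

	printf("bucketsize fraction = %g\n", bucketsize);
	printf("tuple-comparison cost = %g\n", compare_cost);
	return 0;
}

Compile with something like cc sketch.c -lm. With these sample numbers, the 5%-frequency most common value pushes the bucketsize fraction far above the uniform NTUP_PER_BUCKET / rows ideal, which is exactly the penalty the commit wants the cost model to see for skewed inner hash keys.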
