summaryrefslogtreecommitdiff
path: root/src/backend/optimizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/optimizer')
-rw-r--r--src/backend/optimizer/path/costsize.c193
-rw-r--r--src/backend/optimizer/path/joinpath.c58
-rw-r--r--src/backend/optimizer/plan/createplan.c10
-rw-r--r--src/backend/optimizer/plan/initsplan.c9
-rw-r--r--src/backend/optimizer/plan/planner.c6
-rw-r--r--src/backend/optimizer/prep/prepunion.c6
-rw-r--r--src/backend/optimizer/util/pathnode.c12
-rw-r--r--src/backend/optimizer/util/plancat.c3
8 files changed, 219 insertions(+), 78 deletions(-)
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c52af72a16b..bdfbbb18186 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -41,7 +41,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.70 2001/04/25 22:04:37 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.71 2001/05/07 00:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -50,11 +50,15 @@
#include <math.h>
+#include "catalog/pg_statistic.h"
#include "executor/nodeHash.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
+#include "optimizer/pathnode.h"
+#include "parser/parsetree.h"
#include "utils/lsyscache.h"
+#include "utils/syscache.h"
/*
@@ -573,7 +577,7 @@ cost_mergejoin(Path *path,
* 'outer_path' is the path for the outer relation
* 'inner_path' is the path for the inner relation
* 'restrictlist' are the RestrictInfo nodes to be applied at the join
- * 'innerdispersion' is an estimate of the dispersion statistic
+ * 'innerbucketsize' is an estimate of the bucketsize statistic
* for the inner hash key.
*/
void
@@ -581,7 +585,7 @@ cost_hashjoin(Path *path,
Path *outer_path,
Path *inner_path,
List *restrictlist,
- Selectivity innerdispersion)
+ Selectivity innerbucketsize)
{
Cost startup_cost = 0;
Cost run_cost = 0;
@@ -607,22 +611,20 @@ cost_hashjoin(Path *path,
/*
* The number of tuple comparisons needed is the number of outer
- * tuples times the typical hash bucket size. nodeHash.c tries for
- * average bucket loading of NTUP_PER_BUCKET, but that goal will be
- * reached only if data values are uniformly distributed among the
- * buckets. To be conservative, we scale up the target bucket size by
- * the number of inner rows times inner dispersion, giving an estimate
- * of the typical number of duplicates of each value. We then charge
- * one cpu_operator_cost per tuple comparison.
+ * tuples times the typical number of tuples in a hash bucket,
+ * which is the inner relation size times its bucketsize fraction.
+ * We charge one cpu_operator_cost per tuple comparison.
*/
run_cost += cpu_operator_cost * outer_path->parent->rows *
- NTUP_PER_BUCKET * ceil(inner_path->parent->rows * innerdispersion);
+ ceil(inner_path->parent->rows * innerbucketsize);
/*
* Estimate the number of tuples that get through the hashing filter
* as one per tuple in the two source relations. This could be a
* drastic underestimate if there are many equal-keyed tuples in
- * either relation, but we have no good way of estimating that...
+ * either relation, but we have no simple way of estimating that;
+ * and since this is only a second-order parameter, it's probably
+ * not worth expending a lot of effort on the estimate.
*/
ntuples = outer_path->parent->rows + inner_path->parent->rows;
@@ -651,7 +653,7 @@ cost_hashjoin(Path *path,
/*
* Bias against putting larger relation on inside. We don't want an
* absolute prohibition, though, since larger relation might have
- * better dispersion --- and we can't trust the size estimates
+ * better bucketsize --- and we can't trust the size estimates
* unreservedly, anyway. Instead, inflate the startup cost by the
* square root of the size ratio. (Why square root? No real good
* reason, but it seems reasonable...)
@@ -663,6 +665,171 @@ cost_hashjoin(Path *path,
path->total_cost = startup_cost + run_cost;
}
+/*
+ * Estimate hash bucketsize fraction (ie, number of entries in a bucket
+ * divided by total tuples in relation) if the specified Var is used
+ * as a hash key.
+ *
+ * This statistic is used by cost_hashjoin. We split out the calculation
+ * because it's useful to cache the result for re-use across multiple path
+ * cost calculations.
+ *
+ * XXX This is really pretty bogus since we're effectively assuming that the
+ * distribution of hash keys will be the same after applying restriction
+ * clauses as it was in the underlying relation. However, we are not nearly
+ * smart enough to figure out how the restrict clauses might change the
+ * distribution, so this will have to do for now.
+ *
+ * The executor tries for average bucket loading of NTUP_PER_BUCKET by setting
+ * number of buckets equal to ntuples / NTUP_PER_BUCKET, which would yield
+ * a bucketsize fraction of NTUP_PER_BUCKET / ntuples. But that goal will
+ * be reached only if the data values are uniformly distributed among the
+ * buckets, which requires (a) at least ntuples / NTUP_PER_BUCKET distinct
+ * data values, and (b) a not-too-skewed data distribution. Otherwise the
+ * buckets will be nonuniformly occupied. If the other relation in the join
+ * has a similar distribution, the most-loaded buckets are exactly those
+ * that will be probed most often. Therefore, the "average" bucket size for
+ * costing purposes should really be taken as something close to the "worst
+ * case" bucket size. We try to estimate this by first scaling up if there
+ * are too few distinct data values, and then scaling up again by the
+ * ratio of the most common value's frequency to the average frequency.
+ *
+ * If no statistics are available, use a default estimate of 0.1. This will
+ * discourage use of a hash rather strongly if the inner relation is large,
+ * which is what we want. We do not want to hash unless we know that the
+ * inner rel is well-dispersed (or the alternatives seem much worse).
+ */
+Selectivity
+estimate_hash_bucketsize(Query *root, Var *var)
+{
+ Oid relid;
+ RelOptInfo *rel;
+ HeapTuple tuple;
+ Form_pg_statistic stats;
+ double estfract,
+ ndistinct,
+ needdistinct,
+ mcvfreq,
+ avgfreq;
+ float4 *numbers;
+ int nnumbers;
+
+ /*
+ * Lookup info about var's relation and attribute;
+ * if none available, return default estimate.
+ */
+ if (!IsA(var, Var))
+ return 0.1;
+
+ relid = getrelid(var->varno, root->rtable);
+ if (relid == InvalidOid)
+ return 0.1;
+
+ rel = get_base_rel(root, var->varno);
+
+ if (rel->tuples <= 0.0 || rel->rows <= 0.0)
+ return 0.1; /* ensure we can divide below */
+
+ tuple = SearchSysCache(STATRELATT,
+ ObjectIdGetDatum(relid),
+ Int16GetDatum(var->varattno),
+ 0, 0);
+ if (!HeapTupleIsValid(tuple))
+ {
+ /*
+ * Perhaps the Var is a system attribute; if so, it will have no
+ * entry in pg_statistic, but we may be able to guess something
+ * about its distribution anyway.
+ */
+ switch (var->varattno)
+ {
+ case ObjectIdAttributeNumber:
+ case SelfItemPointerAttributeNumber:
+ /* these are unique, so buckets should be well-distributed */
+ return (double) NTUP_PER_BUCKET / rel->rows;
+ case TableOidAttributeNumber:
+ /* hashing this is a terrible idea... */
+ return 1.0;
+ }
+ return 0.1;
+ }
+ stats = (Form_pg_statistic) GETSTRUCT(tuple);
+
+ /*
+ * Obtain number of distinct data values in raw relation.
+ */
+ ndistinct = stats->stadistinct;
+ if (ndistinct < 0.0)
+ ndistinct = -ndistinct * rel->tuples;
+
+ /*
+ * Adjust ndistinct to account for restriction clauses. Observe we are
+ * assuming that the data distribution is affected uniformly by the
+ * restriction clauses!
+ *
+ * XXX Possibly better way, but much more expensive: multiply by
+ * selectivity of rel's restriction clauses that mention the target Var.
+ */
+ ndistinct *= rel->rows / rel->tuples;
+
+ /*
+ * Discourage use of hash join if there seem not to be very many distinct
+ * data values. The threshold here is somewhat arbitrary, as is the
+ * fraction used to "discourage" the choice.
+ */
+ if (ndistinct < 50.0)
+ {
+ ReleaseSysCache(tuple);
+ return 0.5;
+ }
+
+ /*
+ * Form initial estimate of bucketsize fraction. Here we use rel->rows,
+ * ie the number of rows after applying restriction clauses, because
+ * that's what the fraction will eventually be multiplied by in
+ * cost_hashjoin.
+ */
+ estfract = (double) NTUP_PER_BUCKET / rel->rows;
+
+ /*
+ * Adjust estimated bucketsize if too few distinct values to fill
+ * all the buckets.
+ */
+ needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
+ if (ndistinct < needdistinct)
+ estfract *= needdistinct / ndistinct;
+
+ /*
+ * Look up the frequency of the most common value, if available.
+ */
+ mcvfreq = 0.0;
+
+ if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
+ STATISTIC_KIND_MCV, InvalidOid,
+ NULL, NULL, &numbers, &nnumbers))
+ {
+ /*
+ * The first MCV stat is for the most common value.
+ */
+ if (nnumbers > 0)
+ mcvfreq = numbers[0];
+ free_attstatsslot(var->vartype, NULL, 0,
+ numbers, nnumbers);
+ }
+
+ /*
+ * Adjust estimated bucketsize upward to account for skewed distribution.
+ */
+ avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
+
+ if (avgfreq > 0.0 && mcvfreq > avgfreq)
+ estfract *= mcvfreq / avgfreq;
+
+ ReleaseSysCache(tuple);
+
+ return (Selectivity) estfract;
+}
+
/*
* cost_qual_eval
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index d41336ddcee..cd7cabd41de 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -8,15 +8,15 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.63 2001/04/15 00:48:17 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.64 2001/05/07 00:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
+#include "postgres.h"
+
#include <sys/types.h>
#include <math.h>
-#include "postgres.h"
-
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
@@ -45,7 +45,6 @@ static void hash_inner_and_outer(Query *root, RelOptInfo *joinrel,
List *restrictlist, JoinType jointype);
static Path *best_innerjoin(List *join_paths, List *outer_relid,
JoinType jointype);
-static Selectivity estimate_dispersion(Query *root, Var *var);
static List *select_mergejoin_clauses(RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
@@ -722,7 +721,7 @@ hash_inner_and_outer(Query *root,
Expr *clause;
Var *left,
*right;
- Selectivity innerdispersion;
+ Selectivity innerbucketsize;
List *hashclauses;
if (restrictinfo->hashjoinoperator == InvalidOid)
@@ -742,34 +741,34 @@ hash_inner_and_outer(Query *root,
/*
* Check if clause is usable with these sub-rels, find inner side,
- * estimate dispersion of inner var for costing purposes.
+ * estimate bucketsize of inner var for costing purposes.
*
* Since we tend to visit the same clauses over and over when
- * planning a large query, we cache the dispersion estimates in
+ * planning a large query, we cache the bucketsize estimates in
* the RestrictInfo node to avoid repeated lookups of statistics.
*/
if (intMember(left->varno, outerrelids) &&
intMember(right->varno, innerrelids))
{
/* righthand side is inner */
- innerdispersion = restrictinfo->right_dispersion;
- if (innerdispersion < 0)
+ innerbucketsize = restrictinfo->right_bucketsize;
+ if (innerbucketsize < 0)
{
/* not cached yet */
- innerdispersion = estimate_dispersion(root, right);
- restrictinfo->right_dispersion = innerdispersion;
+ innerbucketsize = estimate_hash_bucketsize(root, right);
+ restrictinfo->right_bucketsize = innerbucketsize;
}
}
else if (intMember(left->varno, innerrelids) &&
intMember(right->varno, outerrelids))
{
/* lefthand side is inner */
- innerdispersion = restrictinfo->left_dispersion;
- if (innerdispersion < 0)
+ innerbucketsize = restrictinfo->left_bucketsize;
+ if (innerbucketsize < 0)
{
/* not cached yet */
- innerdispersion = estimate_dispersion(root, left);
- restrictinfo->left_dispersion = innerdispersion;
+ innerbucketsize = estimate_hash_bucketsize(root, left);
+ restrictinfo->left_bucketsize = innerbucketsize;
}
}
else
@@ -790,7 +789,7 @@ hash_inner_and_outer(Query *root,
innerrel->cheapest_total_path,
restrictlist,
hashclauses,
- innerdispersion));
+ innerbucketsize));
if (outerrel->cheapest_startup_path != outerrel->cheapest_total_path)
add_path(joinrel, (Path *)
create_hashjoin_path(joinrel,
@@ -799,7 +798,7 @@ hash_inner_and_outer(Query *root,
innerrel->cheapest_total_path,
restrictlist,
hashclauses,
- innerdispersion));
+ innerbucketsize));
}
}
@@ -867,31 +866,6 @@ best_innerjoin(List *join_paths, Relids outer_relids, JoinType jointype)
}
/*
- * Estimate dispersion of the specified Var
- *
- * We use a default of 0.1 if we can't figure out anything better.
- * This will typically discourage use of a hash rather strongly,
- * if the inner relation is large. We do not want to hash unless
- * we know that the inner rel is well-dispersed (or the alternatives
- * seem much worse).
- */
-static Selectivity
-estimate_dispersion(Query *root, Var *var)
-{
- Oid relid;
-
- if (!IsA(var, Var))
- return 0.1;
-
- relid = getrelid(var->varno, root->rtable);
-
- if (relid == InvalidOid)
- return 0.1;
-
- return (Selectivity) get_attdispersion(relid, var->varattno, 0.1);
-}
-
-/*
* select_mergejoin_clauses
* Select mergejoin clauses that are usable for a particular join.
* Returns a list of RestrictInfo nodes for those clauses.
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 8c3b00289d3..2d264c46881 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,14 +10,14 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.104 2001/03/22 03:59:36 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.105 2001/05/07 00:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-#include <sys/types.h>
-
#include "postgres.h"
+#include <sys/types.h>
+
#include "catalog/pg_index.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
@@ -1484,9 +1484,9 @@ make_sort_from_pathkeys(List *tlist, Plan *lefttree, List *pathkeys)
*/
if (resdom->reskey == 0)
{
- /* OK, mark it as a sort key and set the sort operator regproc */
+ /* OK, mark it as a sort key and set the sort operator */
resdom->reskey = ++numsortkeys;
- resdom->reskeyop = get_opcode(pathkey->sortop);
+ resdom->reskeyop = pathkey->sortop;
}
}
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index 7c3e15a8f88..5d67e02dacb 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -8,13 +8,14 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.59 2001/04/16 19:44:10 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.60 2001/05/07 00:43:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
+#include "postgres.h"
+
#include <sys/types.h>
-#include "postgres.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
#include "nodes/makefuncs.h"
@@ -348,8 +349,8 @@ distribute_qual_to_rels(Query *root, Node *clause,
restrictinfo->left_pathkey = NIL; /* not computable yet */
restrictinfo->right_pathkey = NIL;
restrictinfo->hashjoinoperator = InvalidOid;
- restrictinfo->left_dispersion = -1; /* not computed until needed */
- restrictinfo->right_dispersion = -1;
+ restrictinfo->left_bucketsize = -1; /* not computed until needed */
+ restrictinfo->right_bucketsize = -1;
/*
* Retrieve all relids and vars contained within the clause.
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b2ab4600209..0aba4808c16 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.105 2001/04/30 19:24:47 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.106 2001/05/07 00:43:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1367,7 +1367,7 @@ make_groupplan(List *group_tlist,
{
/* OK, insert the ordering info needed by the executor. */
resdom->reskey = ++keyno;
- resdom->reskeyop = get_opcode(grpcl->sortop);
+ resdom->reskeyop = grpcl->sortop;
}
}
@@ -1412,7 +1412,7 @@ make_sortplan(List *tlist, Plan *plannode, List *sortcls)
{
/* OK, insert the ordering info needed by the executor. */
resdom->reskey = ++keyno;
- resdom->reskeyop = get_opcode(sortcl->sortop);
+ resdom->reskeyop = sortcl->sortop;
}
}
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 0b173466cf9..ede4159d970 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -14,7 +14,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/prep/prepunion.c,v 1.62 2001/03/27 18:02:19 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/prep/prepunion.c,v 1.63 2001/05/07 00:43:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -682,8 +682,8 @@ adjust_inherited_attrs_mutator(Node *node,
newinfo->eval_cost = -1; /* reset this too */
newinfo->left_pathkey = NIL; /* and these */
newinfo->right_pathkey = NIL;
- newinfo->left_dispersion = -1;
- newinfo->right_dispersion = -1;
+ newinfo->left_bucketsize = -1;
+ newinfo->right_bucketsize = -1;
return (Node *) newinfo;
}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index cfba3ee395f..407c132b4f7 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,14 +8,14 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.71 2001/03/22 03:59:39 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.72 2001/05/07 00:43:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-#include <math.h>
-
#include "postgres.h"
+#include <math.h>
+
#include "nodes/plannodes.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
@@ -559,7 +559,7 @@ create_mergejoin_path(RelOptInfo *joinrel,
* 'restrict_clauses' are the RestrictInfo nodes to apply at the join
* 'hashclauses' is a list of the hash join clause (always a 1-element list)
* (this should be a subset of the restrict_clauses list)
- * 'innerdispersion' is an estimate of the dispersion of the inner hash key
+ * 'innerbucketsize' is an estimate of the bucketsize of the inner hash key
*
*/
HashPath *
@@ -569,7 +569,7 @@ create_hashjoin_path(RelOptInfo *joinrel,
Path *inner_path,
List *restrict_clauses,
List *hashclauses,
- Selectivity innerdispersion)
+ Selectivity innerbucketsize)
{
HashPath *pathnode = makeNode(HashPath);
@@ -587,7 +587,7 @@ create_hashjoin_path(RelOptInfo *joinrel,
outer_path,
inner_path,
restrict_clauses,
- innerdispersion);
+ innerbucketsize);
return pathnode;
}
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 4f711df203c..ee3523553e8 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -9,11 +9,10 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/optimizer/util/plancat.c,v 1.64 2001/03/22 03:59:40 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/optimizer/util/plancat.c,v 1.65 2001/05/07 00:43:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-
#include "postgres.h"
#include <math.h>