Make use of statistics on index expressions. There are still some ...

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c

index c23cf4d23246d5f3dd26bcd8ce02920c702c3eac..79674ac4b94881a74c2efe343459d07c6f40183d 100644 (file)
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -49,7 +49,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -102,8 +102,6 @@ bool        enable_mergejoin = true;
  bool       enable_hashjoin = true;
  
  
-static Selectivity estimate_hash_bucketsize(Query *root, Var *var,
-                        int nbuckets);
  static bool cost_qual_eval_walker(Node *node, QualCost *total);
  static Selectivity approx_selectivity(Query *root, List *quals,
                    JoinType jointype);
@@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root)
                     /* not cached yet */
                     thisbucketsize =
                         estimate_hash_bucketsize(root,
-                              (Var *) get_rightop(restrictinfo->clause),
+                                                get_rightop(restrictinfo->clause),
                                                  virtualbuckets);
                     restrictinfo->right_bucketsize = thisbucketsize;
                 }
@@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root)
                     /* not cached yet */
                     thisbucketsize =
                         estimate_hash_bucketsize(root,
-                               (Var *) get_leftop(restrictinfo->clause),
+                                                get_leftop(restrictinfo->clause),
                                                  virtualbuckets);
                     restrictinfo->left_bucketsize = thisbucketsize;
                 }
@@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root)
     path->jpath.path.total_cost = startup_cost + run_cost;
  }
  
-/*
- * Estimate hash bucketsize fraction (ie, number of entries in a bucket
- * divided by total tuples in relation) if the specified Var is used
- * as a hash key.
- *
- * XXX This is really pretty bogus since we're effectively assuming that the
- * distribution of hash keys will be the same after applying restriction
- * clauses as it was in the underlying relation.  However, we are not nearly
- * smart enough to figure out how the restrict clauses might change the
- * distribution, so this will have to do for now.
- *
- * We are passed the number of buckets the executor will use for the given
- * input relation. If the data were perfectly distributed, with the same
- * number of tuples going into each available bucket, then the bucketsize
- * fraction would be 1/nbuckets.  But this happy state of affairs will occur
- * only if (a) there are at least nbuckets distinct data values, and (b)
- * we have a not-too-skewed data distribution. Otherwise the buckets will
- * be nonuniformly occupied.  If the other relation in the join has a key
- * distribution similar to this one's, then the most-loaded buckets are
- * exactly those that will be probed most often.  Therefore, the "average"
- * bucket size for costing purposes should really be taken as something close
- * to the "worst case" bucket size.  We try to estimate this by adjusting the
- * fraction if there are too few distinct data values, and then scaling up
- * by the ratio of the most common value's frequency to the average frequency.
- *
- * If no statistics are available, use a default estimate of 0.1.  This will
- * discourage use of a hash rather strongly if the inner relation is large,
- * which is what we want.  We do not want to hash unless we know that the
- * inner rel is well-dispersed (or the alternatives seem much worse).
- */
-static Selectivity
-estimate_hash_bucketsize(Query *root, Var *var, int nbuckets)
-{
-   Oid         relid;
-   RelOptInfo *rel;
-   HeapTuple   tuple;
-   Form_pg_statistic stats;
-   double      estfract,
-               ndistinct,
-               mcvfreq,
-               avgfreq;
-   float4     *numbers;
-   int         nnumbers;
-
-   /* Ignore any binary-compatible relabeling */
-   if (var && IsA(var, RelabelType))
-       var = (Var *) ((RelabelType *) var)->arg;
-
-   /*
-    * Lookup info about var's relation and attribute; if none available,
-    * return default estimate.
-    */
-   if (var == NULL || !IsA(var, Var))
-       return 0.1;
-
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       return 0.1;
-
-   rel = find_base_rel(root, var->varno);
-
-   if (rel->tuples <= 0.0 || rel->rows <= 0.0)
-       return 0.1;             /* ensure we can divide below */
-
-   tuple = SearchSysCache(STATRELATT,
-                          ObjectIdGetDatum(relid),
-                          Int16GetDatum(var->varattno),
-                          0, 0);
-   if (!HeapTupleIsValid(tuple))
-   {
-       /*
-        * If the attribute is known unique because of an index,
-        * we can treat it as well-distributed.
-        */
-       if (has_unique_index(rel, var->varattno))
-           return 1.0 / (double) nbuckets;
-
-       /*
-        * Perhaps the Var is a system attribute; if so, it will have no
-        * entry in pg_statistic, but we may be able to guess something
-        * about its distribution anyway.
-        */
-       switch (var->varattno)
-       {
-           case ObjectIdAttributeNumber:
-           case SelfItemPointerAttributeNumber:
-               /* these are unique, so buckets should be well-distributed */
-               return 1.0 / (double) nbuckets;
-           case TableOidAttributeNumber:
-               /* hashing this is a terrible idea... */
-               return 1.0;
-       }
-       return 0.1;
-   }
-   stats = (Form_pg_statistic) GETSTRUCT(tuple);
-
-   /*
-    * Obtain number of distinct data values in raw relation.
-    */
-   ndistinct = stats->stadistinct;
-   if (ndistinct < 0.0)
-       ndistinct = -ndistinct * rel->tuples;
-
-   if (ndistinct <= 0.0)       /* ensure we can divide */
-   {
-       ReleaseSysCache(tuple);
-       return 0.1;
-   }
-
-   /* Also compute avg freq of all distinct data values in raw relation */
-   avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
-
-   /*
-    * Adjust ndistinct to account for restriction clauses.  Observe we
-    * are assuming that the data distribution is affected uniformly by
-    * the restriction clauses!
-    *
-    * XXX Possibly better way, but much more expensive: multiply by
-    * selectivity of rel's restriction clauses that mention the target
-    * Var.
-    */
-   ndistinct *= rel->rows / rel->tuples;
-
-   /*
-    * Initial estimate of bucketsize fraction is 1/nbuckets as long as
-    * the number of buckets is less than the expected number of distinct
-    * values; otherwise it is 1/ndistinct.
-    */
-   if (ndistinct > (double) nbuckets)
-       estfract = 1.0 / (double) nbuckets;
-   else
-       estfract = 1.0 / ndistinct;
-
-   /*
-    * Look up the frequency of the most common value, if available.
-    */
-   mcvfreq = 0.0;
-
-   if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
-                        STATISTIC_KIND_MCV, InvalidOid,
-                        NULL, NULL, &numbers, &nnumbers))
-   {
-       /*
-        * The first MCV stat is for the most common value.
-        */
-       if (nnumbers > 0)
-           mcvfreq = numbers[0];
-       free_attstatsslot(var->vartype, NULL, 0,
-                         numbers, nnumbers);
-   }
-
-   /*
-    * Adjust estimated bucketsize upward to account for skewed
-    * distribution.
-    */
-   if (avgfreq > 0.0 && mcvfreq > avgfreq)
-       estfract *= mcvfreq / avgfreq;
-
-   /*
-    * Clamp bucketsize to sane range (the above adjustment could easily
-    * produce an out-of-range result).  We set the lower bound a little
-    * above zero, since zero isn't a very sane result.
-    */
-   if (estfract < 1.0e-6)
-       estfract = 1.0e-6;
-   else if (estfract > 1.0)
-       estfract = 1.0;
-
-   ReleaseSysCache(tuple);
-
-   return (Selectivity) estfract;
-}
-
  
  /*
   * cost_qual_eval
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c

index d6d093ea467666acddfbbbfde4f4114c50eac3c4..d5a5480c62e94a3c37411c724db60b9691ff836c 100644 (file)
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.54 2003/12/08 18:19:58 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.55 2004/02/17 00:52:53 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -214,12 +214,8 @@ find_base_rel(Query *root, int relid)
   * find_join_rel
   *   Returns relation entry corresponding to 'relids' (a set of RT indexes),
   *   or NULL if none exists.  This is for join relations.
- *
- * Note: there is probably no good reason for this to be called from
- * anywhere except build_join_rel, but keep it as a separate routine
- * just in case.
   */
-static RelOptInfo *
+RelOptInfo *
  find_join_rel(Query *root, Relids relids)
  {
     List       *joinrels;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 84f18dc935993a70a20459a4bfe4df0314223f05..054739140970a5de8805e59ee5f37e293d2778bc 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.156 2004/02/02 03:07:08 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.157 2004/02/17 00:52:53 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -136,7 +136,6 @@
  /* default selectivity estimate for boolean and null test nodes */
  #define DEFAULT_UNK_SEL            0.005
  #define DEFAULT_NOT_UNK_SEL        (1.0 - DEFAULT_UNK_SEL)
-#define DEFAULT_BOOL_SEL       0.5
  
  /*
   * Clamp a computed probability estimate (which may suffer from roundoff or
@@ -151,7 +150,25 @@
     } while (0)
  
  
-static bool get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max);
+/* Return data from examine_variable and friends */
+typedef struct
+{
+   Node       *var;            /* the Var or expression tree */
+   RelOptInfo *rel;            /* Relation, or NULL if not identifiable */
+   HeapTuple   statsTuple;     /* pg_statistic tuple, or NULL if none */
+   /* NB: if statsTuple!=NULL, it must be freed when caller is done */
+   Oid         atttype;        /* type to pass to get_attstatsslot */
+   int32       atttypmod;      /* typmod to pass to get_attstatsslot */
+   bool        isunique;       /* true if matched to a unique index */
+} VariableStatData;
+
+#define ReleaseVariableStats(vardata)  \
+   do { \
+       if (HeapTupleIsValid((vardata).statsTuple)) \
+           ReleaseSysCache((vardata).statsTuple); \
+   } while(0)
+
+
  static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                   Datum lobound, Datum hibound, Oid boundstypid,
                   double *scaledlobound, double *scaledhibound);
@@ -174,13 +191,18 @@ static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
                             int rangelo, int rangehi);
  static unsigned char *convert_string_datum(Datum value, Oid typid);
  static double convert_timevalue_to_scalar(Datum value, Oid typid);
-static double get_att_numdistinct(Query *root, Var *var,
-                   Form_pg_statistic stats);
-static bool get_restriction_var(List *args, int varRelid,
-                   Var **var, Node **other,
+static bool get_restriction_variable(Query *root, List *args, int varRelid,
+                   VariableStatData *vardata, Node **other,
                     bool *varonleft);
-static void get_join_vars(List *args, Var **var1, Var **var2);
-static Selectivity prefix_selectivity(Query *root, Var *var,
+static void get_join_variables(Query *root, List *args,
+                              VariableStatData *vardata1,
+                              VariableStatData *vardata2);
+static void examine_variable(Query *root, Node *node, int varRelid,
+                            VariableStatData *vardata);
+static double get_variable_numdistinct(VariableStatData *vardata);
+static bool get_variable_maximum(Query *root, VariableStatData *vardata,
+                                Oid sortop, Datum *max);
+static Selectivity prefix_selectivity(Query *root, VariableStatData *vardata,
                    Oid opclass, Const *prefix);
  static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
  static Datum string_to_datum(const char *str, Oid datatype);
@@ -203,11 +225,9 @@ eqsel(PG_FUNCTION_ARGS)
     Oid         operator = PG_GETARG_OID(1);
     List       *args = (List *) PG_GETARG_POINTER(2);
     int         varRelid = PG_GETARG_INT32(3);
-   Var        *var;
+   VariableStatData vardata;
     Node       *other;
     bool        varonleft;
-   Oid         relid;
-   HeapTuple   statsTuple;
     Datum      *values;
     int         nvalues;
     float4     *numbers;
@@ -215,15 +235,11 @@ eqsel(PG_FUNCTION_ARGS)
     double      selec;
  
     /*
-    * If expression is not var = something or something = var for a
-    * simple var of a real relation (no subqueries, for now), then punt
-    * and return a default estimate.
+    * If expression is not variable = something or something = variable,
+    * then punt and return a default estimate.
      */
-   if (!get_restriction_var(args, varRelid,
-                            &var, &other, &varonleft))
-       PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
         PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
  
     /*
@@ -232,22 +248,20 @@ eqsel(PG_FUNCTION_ARGS)
      */
     if (IsA(other, Const) &&
         ((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
         PG_RETURN_FLOAT8(0.0);
+   }
  
-   /* get stats for the attribute, if available */
-   statsTuple = SearchSysCache(STATRELATT,
-                               ObjectIdGetDatum(relid),
-                               Int16GetDatum(var->varattno),
-                               0, 0);
-   if (HeapTupleIsValid(statsTuple))
+   if (HeapTupleIsValid(vardata.statsTuple))
     {
         Form_pg_statistic stats;
  
-       stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
  
         if (IsA(other, Const))
         {
-           /* Var is being compared to a known non-null constant */
+           /* Variable is being compared to a known non-null constant */
             Datum       constval = ((Const *) other)->constvalue;
             bool        match = false;
             int         i;
@@ -259,7 +273,8 @@ eqsel(PG_FUNCTION_ARGS)
              * an appropriate test.  If you don't like this, maybe you
              * shouldn't be using eqsel for your operator...)
              */
-           if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+           if (get_attstatsslot(vardata.statsTuple,
+                                vardata.atttype, vardata.atttypmod,
                                  STATISTIC_KIND_MCV, InvalidOid,
                                  &values, &nvalues,
                                  &numbers, &nnumbers))
@@ -321,7 +336,7 @@ eqsel(PG_FUNCTION_ARGS)
                  * remaining fraction equally, so we divide by the number
                  * of other distinct values.
                  */
-               otherdistinct = get_att_numdistinct(root, var, stats)
+               otherdistinct = get_variable_numdistinct(&vardata)
                     - nnumbers;
                 if (otherdistinct > 1)
                     selec /= otherdistinct;
@@ -334,7 +349,7 @@ eqsel(PG_FUNCTION_ARGS)
                     selec = numbers[nnumbers - 1];
             }
  
-           free_attstatsslot(var->vartype, values, nvalues,
+           free_attstatsslot(vardata.atttype, values, nvalues,
                               numbers, nnumbers);
         }
         else
@@ -352,7 +367,7 @@ eqsel(PG_FUNCTION_ARGS)
              * frequency in the table.  Is that a good idea?)
              */
             selec = 1.0 - stats->stanullfrac;
-           ndistinct = get_att_numdistinct(root, var, stats);
+           ndistinct = get_variable_numdistinct(&vardata);
             if (ndistinct > 1)
                 selec /= ndistinct;
  
@@ -360,18 +375,17 @@ eqsel(PG_FUNCTION_ARGS)
              * Cross-check: selectivity should never be estimated as more
              * than the most common value's.
              */
-           if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+           if (get_attstatsslot(vardata.statsTuple,
+                                vardata.atttype, vardata.atttypmod,
                                  STATISTIC_KIND_MCV, InvalidOid,
                                  NULL, NULL,
                                  &numbers, &nnumbers))
             {
                 if (nnumbers > 0 && selec > numbers[0])
                     selec = numbers[0];
-               free_attstatsslot(var->vartype, NULL, 0, numbers, nnumbers);
+               free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
             }
         }
-
-       ReleaseSysCache(statsTuple);
     }
     else
     {
@@ -381,9 +395,11 @@ eqsel(PG_FUNCTION_ARGS)
          * equally common.  (The guess is unlikely to be very good, but we
          * do know a few special cases.)
          */
-       selec = 1.0 / get_att_numdistinct(root, var, NULL);
+       selec = 1.0 / get_variable_numdistinct(&vardata);
     }
  
+   ReleaseVariableStats(vardata);
+
     /* result should be in range, but make sure... */
     CLAMP_PROBABILITY(selec);
  
@@ -433,7 +449,7 @@ neqsel(PG_FUNCTION_ARGS)
   * scalarineqsel       - Selectivity of "<", "<=", ">", ">=" for scalars.
   *
   * This is the guts of both scalarltsel and scalargtsel.  The caller has
- * commuted the clause, if necessary, so that we can treat the Var as
+ * commuted the clause, if necessary, so that we can treat the variable as
   * being on the left.  The caller must also make sure that the other side
   * of the clause is a non-null Const, and dissect same into a value and
   * datatype.
@@ -444,10 +460,8 @@ neqsel(PG_FUNCTION_ARGS)
   */
  static double
  scalarineqsel(Query *root, Oid operator, bool isgt,
-             Var *var, Datum constval, Oid consttype)
+             VariableStatData *vardata, Datum constval, Oid consttype)
  {
-   Oid         relid;
-   HeapTuple   statsTuple;
     Form_pg_statistic stats;
     FmgrInfo    opproc;
     Datum      *values;
@@ -460,26 +474,12 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
     double      selec;
     int         i;
  
-   /*
-    * If expression is not var op something or something op var for a
-    * simple var of a real relation (no subqueries, for now), then punt
-    * and return a default estimate.
-    */
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       return DEFAULT_INEQ_SEL;
-
-   /* get stats for the attribute */
-   statsTuple = SearchSysCache(STATRELATT,
-                               ObjectIdGetDatum(relid),
-                               Int16GetDatum(var->varattno),
-                               0, 0);
-   if (!HeapTupleIsValid(statsTuple))
+   if (!HeapTupleIsValid(vardata->statsTuple))
     {
         /* no stats available, so default result */
         return DEFAULT_INEQ_SEL;
     }
-   stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+   stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
  
     fmgr_info(get_opcode(operator), &opproc);
  
@@ -492,7 +492,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
     mcv_selec = 0.0;
     sumcommon = 0.0;
  
-   if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+   if (get_attstatsslot(vardata->statsTuple,
+                        vardata->atttype, vardata->atttypmod,
                          STATISTIC_KIND_MCV, InvalidOid,
                          &values, &nvalues,
                          &numbers, &nnumbers))
@@ -505,7 +506,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                 mcv_selec += numbers[i];
             sumcommon += numbers[i];
         }
-       free_attstatsslot(var->vartype, values, nvalues, numbers, nnumbers);
+       free_attstatsslot(vardata->atttype, values, nvalues,
+                         numbers, nnumbers);
     }
  
     /*
@@ -523,7 +525,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
      */
     hist_selec = 0.0;
  
-   if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+   if (get_attstatsslot(vardata->statsTuple,
+                        vardata->atttype, vardata->atttypmod,
                          STATISTIC_KIND_HISTOGRAM, InvalidOid,
                          &values, &nvalues,
                          NULL, NULL))
@@ -582,7 +585,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                      */
                     if (convert_to_scalar(constval, consttype, &val,
                                           values[i - 1], values[i],
-                                         var->vartype,
+                                         vardata->atttype,
                                           &low, &high))
                     {
                         if (high <= low)
@@ -653,7 +656,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                 hist_selec = 0.9999;
         }
  
-       free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
+       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
     }
  
     /*
@@ -676,8 +679,6 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
  
     selec += mcv_selec;
  
-   ReleaseSysCache(statsTuple);
-
     /* result should be in range, but make sure... */
     CLAMP_PROBABILITY(selec);
  
@@ -694,21 +695,20 @@ scalarltsel(PG_FUNCTION_ARGS)
     Oid         operator = PG_GETARG_OID(1);
     List       *args = (List *) PG_GETARG_POINTER(2);
     int         varRelid = PG_GETARG_INT32(3);
-   Var        *var;
+   VariableStatData vardata;
     Node       *other;
+   bool        varonleft;
     Datum       constval;
     Oid         consttype;
-   bool        varonleft;
     bool        isgt;
     double      selec;
  
     /*
-    * If expression is not var op something or something op var for a
-    * simple var of a real relation (no subqueries, for now), then punt
-    * and return a default estimate.
+    * If expression is not variable op something or something op variable,
+    * then punt and return a default estimate.
      */
-   if (!get_restriction_var(args, varRelid,
-                            &var, &other, &varonleft))
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
  
     /*
@@ -716,14 +716,20 @@ scalarltsel(PG_FUNCTION_ARGS)
      * either.
      */
     if (!IsA(other, Const))
+   {
+       ReleaseVariableStats(vardata);
         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+   }
  
     /*
      * If the constant is NULL, assume operator is strict and return zero,
      * ie, operator will never return TRUE.
      */
     if (((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
         PG_RETURN_FLOAT8(0.0);
+   }
     constval = ((Const *) other)->constvalue;
     consttype = ((Const *) other)->consttype;
  
@@ -742,12 +748,15 @@ scalarltsel(PG_FUNCTION_ARGS)
         if (!operator)
         {
             /* Use default selectivity (should we raise an error instead?) */
+           ReleaseVariableStats(vardata);
             PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
         }
         isgt = true;
     }
  
-   selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
+   selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+
+   ReleaseVariableStats(vardata);
  
     PG_RETURN_FLOAT8((float8) selec);
  }
@@ -762,21 +771,20 @@ scalargtsel(PG_FUNCTION_ARGS)
     Oid         operator = PG_GETARG_OID(1);
     List       *args = (List *) PG_GETARG_POINTER(2);
     int         varRelid = PG_GETARG_INT32(3);
-   Var        *var;
+   VariableStatData vardata;
     Node       *other;
+   bool        varonleft;
     Datum       constval;
     Oid         consttype;
-   bool        varonleft;
     bool        isgt;
     double      selec;
  
     /*
-    * If expression is not var op something or something op var for a
-    * simple var of a real relation (no subqueries, for now), then punt
-    * and return a default estimate.
+    * If expression is not variable op something or something op variable,
+    * then punt and return a default estimate.
      */
-   if (!get_restriction_var(args, varRelid,
-                            &var, &other, &varonleft))
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
  
     /*
@@ -784,14 +792,20 @@ scalargtsel(PG_FUNCTION_ARGS)
      * either.
      */
     if (!IsA(other, Const))
+   {
+       ReleaseVariableStats(vardata);
         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+   }
  
     /*
      * If the constant is NULL, assume operator is strict and return zero,
      * ie, operator will never return TRUE.
      */
     if (((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
         PG_RETURN_FLOAT8(0.0);
+   }
     constval = ((Const *) other)->constvalue;
     consttype = ((Const *) other)->consttype;
  
@@ -810,12 +824,15 @@ scalargtsel(PG_FUNCTION_ARGS)
         if (!operator)
         {
             /* Use default selectivity (should we raise an error instead?) */
+           ReleaseVariableStats(vardata);
             PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
         }
         isgt = false;
     }
  
-   selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
+   selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+
+   ReleaseVariableStats(vardata);
  
     PG_RETURN_FLOAT8((float8) selec);
  }
@@ -833,10 +850,9 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
  #endif
     List       *args = (List *) PG_GETARG_POINTER(2);
     int         varRelid = PG_GETARG_INT32(3);
-   Var        *var;
+   VariableStatData vardata;
     Node       *other;
     bool        varonleft;
-   Oid         relid;
     Datum       constval;
     Oid         consttype;
     Oid         vartype;
@@ -848,25 +864,27 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
     double      result;
  
     /*
-    * If expression is not var op constant for a simple var of a real
-    * relation (no subqueries, for now), then punt and return a default
-    * estimate.
+    * If expression is not variable op constant, then punt and return a
+    * default estimate.
      */
-   if (!get_restriction_var(args, varRelid,
-                            &var, &other, &varonleft))
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
         return DEFAULT_MATCH_SEL;
     if (!varonleft || !IsA(other, Const))
+   {
+       ReleaseVariableStats(vardata);
         return DEFAULT_MATCH_SEL;
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       return DEFAULT_MATCH_SEL;
+   }
  
     /*
      * If the constant is NULL, assume operator is strict and return zero,
      * ie, operator will never return TRUE.
      */
     if (((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
         return 0.0;
+   }
     constval = ((Const *) other)->constvalue;
     consttype = ((Const *) other)->consttype;
  
@@ -877,14 +895,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
      * match the operator's declared type.
      */
     if (consttype != TEXTOID && consttype != BYTEAOID)
+   {
+       ReleaseVariableStats(vardata);
         return DEFAULT_MATCH_SEL;
+   }
  
     /*
      * The var, on the other hand, might be a binary-compatible type;
      * particularly a domain.  Try to fold it if it's not recognized
      * immediately.
      */
-   vartype = var->vartype;
+   vartype = vardata.atttype;
     if (vartype != consttype)
         vartype = getBaseType(vartype);
  
@@ -915,6 +936,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
             opclass = BYTEA_BTREE_OPS_OID;
             break;
         default:
+           ReleaseVariableStats(vardata);
             return DEFAULT_MATCH_SEL;
     }
  
@@ -943,6 +965,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
             default:
                 elog(ERROR, "unrecognized consttype: %u",
                      prefix->consttype);
+               ReleaseVariableStats(vardata);
                 return DEFAULT_MATCH_SEL;
         }
         prefix = string_to_const(prefixstr, vartype);
@@ -960,7 +983,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
  
         if (eqopr == InvalidOid)
             elog(ERROR, "no = operator for opclass %u", opclass);
-       eqargs = makeList2(var, prefix);
+       eqargs = makeList2(vardata.var, prefix);
         result = DatumGetFloat8(DirectFunctionCall4(eqsel,
                                                     PointerGetDatum(root),
                                                  ObjectIdGetDatum(eqopr),
@@ -979,7 +1002,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
         Selectivity selec;
  
         if (pstatus == Pattern_Prefix_Partial)
-           prefixsel = prefix_selectivity(root, var, opclass, prefix);
+           prefixsel = prefix_selectivity(root, &vardata, opclass, prefix);
         else
             prefixsel = 1.0;
         restsel = pattern_selectivity(rest, ptype);
@@ -995,6 +1018,8 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
         pfree(prefix);
     }
  
+   ReleaseVariableStats(vardata);
+
     return result;
  }
  
@@ -1093,80 +1118,25 @@ Selectivity
  booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
             int varRelid, JoinType jointype)
  {
-   Var        *var;
-   Oid         relid;
-   HeapTuple   statsTuple;
-   Datum      *values;
-   int         nvalues;
-   float4     *numbers;
-   int         nnumbers;
+   VariableStatData vardata;
     double      selec;
  
-   /*
-    * Ignore any binary-compatible relabeling (probably unnecessary, but
-    * can't hurt)
-    */
-   if (IsA(arg, RelabelType))
-       arg = (Node *) ((RelabelType *) arg)->arg;
-
-   if (IsA(arg, Var) &&
-       (varRelid == 0 || varRelid == ((Var *) arg)->varno))
-       var = (Var *) arg;
-   else
-   {
-       /*
-        * If argument is not a Var, we can't get statistics for it, but
-        * perhaps clause_selectivity can do something with it.  We ignore
-        * the possibility of a NULL value when using clause_selectivity,
-        * and just assume the value is either TRUE or FALSE.
-        */
-       switch (booltesttype)
-       {
-           case IS_UNKNOWN:
-               selec = DEFAULT_UNK_SEL;
-               break;
-           case IS_NOT_UNKNOWN:
-               selec = DEFAULT_NOT_UNK_SEL;
-               break;
-           case IS_TRUE:
-           case IS_NOT_FALSE:
-               selec = (double) clause_selectivity(root, arg,
-                                                   varRelid, jointype);
-               break;
-           case IS_FALSE:
-           case IS_NOT_TRUE:
-               selec = 1.0 - (double) clause_selectivity(root, arg,
-                                                    varRelid, jointype);
-               break;
-           default:
-               elog(ERROR, "unrecognized booltesttype: %d",
-                    (int) booltesttype);
-               selec = 0.0;    /* Keep compiler quiet */
-               break;
-       }
-       return (Selectivity) selec;
-   }
-
-   /* get stats for the attribute, if available */
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       statsTuple = NULL;
-   else
-       statsTuple = SearchSysCache(STATRELATT,
-                                   ObjectIdGetDatum(relid),
-                                   Int16GetDatum(var->varattno),
-                                   0, 0);
+   examine_variable(root, arg, varRelid, &vardata);
  
-   if (HeapTupleIsValid(statsTuple))
+   if (HeapTupleIsValid(vardata.statsTuple))
     {
         Form_pg_statistic stats;
         double      freq_null;
+       Datum      *values;
+       int         nvalues;
+       float4     *numbers;
+       int         nnumbers;
  
-       stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
-
+       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
         freq_null = stats->stanullfrac;
  
-       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+       if (get_attstatsslot(vardata.statsTuple,
+                            vardata.atttype, vardata.atttypmod,
                              STATISTIC_KIND_MCV, InvalidOid,
                              &values, &nvalues,
                              &numbers, &nnumbers)
@@ -1184,7 +1154,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                 freq_true = 1.0 - numbers[0] - freq_null;
  
             /*
-            * Next derive freqency for false. Then use these as
+            * Next derive frequency for false. Then use these as
              * appropriate to derive frequency for each case.
              */
             freq_false = 1.0 - freq_true - freq_null;
@@ -1222,7 +1192,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                     break;
             }
  
-           free_attstatsslot(var->vartype, values, nvalues,
+           free_attstatsslot(vardata.atttype, values, nvalues,
                               numbers, nnumbers);
         }
         else
@@ -1263,14 +1233,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                     break;
             }
         }
-
-       ReleaseSysCache(statsTuple);
     }
     else
     {
         /*
-        * No VACUUM ANALYZE stats available, so use a default value.
-        * (Note: not much point in recursing to clause_selectivity here.)
+        * If we can't get variable statistics for the argument, perhaps
+        * clause_selectivity can do something with it.  We ignore
+        * the possibility of a NULL value when using clause_selectivity,
+        * and just assume the value is either TRUE or FALSE.
          */
         switch (booltesttype)
         {
@@ -1281,10 +1251,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                 selec = DEFAULT_NOT_UNK_SEL;
                 break;
             case IS_TRUE:
-           case IS_NOT_TRUE:
-           case IS_FALSE:
             case IS_NOT_FALSE:
-               selec = DEFAULT_BOOL_SEL;
+               selec = (double) clause_selectivity(root, arg,
+                                                   varRelid, jointype);
+               break;
+           case IS_FALSE:
+           case IS_NOT_TRUE:
+               selec = 1.0 - (double) clause_selectivity(root, arg,
+                                                         varRelid, jointype);
                 break;
             default:
                 elog(ERROR, "unrecognized booltesttype: %d",
@@ -1294,6 +1268,8 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
         }
     }
  
+   ReleaseVariableStats(vardata);
+
     /* result should be in range, but make sure... */
     CLAMP_PROBABILITY(selec);
  
@@ -1306,56 +1282,17 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
  Selectivity
  nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid)
  {
-   Var        *var;
-   Oid         relid;
-   HeapTuple   statsTuple;
+   VariableStatData vardata;
     double      selec;
-   double      defselec;
-   double      freq_null;
-
-   switch (nulltesttype)
-   {
-       case IS_NULL:
-           defselec = DEFAULT_UNK_SEL;
-           break;
-       case IS_NOT_NULL:
-           defselec = DEFAULT_NOT_UNK_SEL;
-           break;
-       default:
-           elog(ERROR, "unrecognized nulltesttype: %d",
-                (int) nulltesttype);
-           return (Selectivity) 0;     /* keep compiler quiet */
-   }
-
-   /*
-    * Ignore any binary-compatible relabeling
-    */
-   if (IsA(arg, RelabelType))
-       arg = (Node *) ((RelabelType *) arg)->arg;
-
-   if (IsA(arg, Var) &&
-       (varRelid == 0 || varRelid == ((Var *) arg)->varno))
-       var = (Var *) arg;
-   else
-   {
-       /* punt if non-Var argument */
-       return (Selectivity) defselec;
-   }
  
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       return (Selectivity) defselec;
+   examine_variable(root, arg, varRelid, &vardata);
  
-   /* get stats for the attribute, if available */
-   statsTuple = SearchSysCache(STATRELATT,
-                               ObjectIdGetDatum(relid),
-                               Int16GetDatum(var->varattno),
-                               0, 0);
-   if (HeapTupleIsValid(statsTuple))
+   if (HeapTupleIsValid(vardata.statsTuple))
     {
         Form_pg_statistic stats;
+       double      freq_null;
  
-       stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
         freq_null = stats->stanullfrac;
  
         switch (nulltesttype)
@@ -1380,17 +1317,29 @@ nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid)
                      (int) nulltesttype);
                 return (Selectivity) 0; /* keep compiler quiet */
         }
-
-       ReleaseSysCache(statsTuple);
     }
     else
     {
         /*
          * No VACUUM ANALYZE stats available, so make a guess
          */
-       selec = defselec;
+       switch (nulltesttype)
+       {
+           case IS_NULL:
+               selec = DEFAULT_UNK_SEL;
+               break;
+           case IS_NOT_NULL:
+               selec = DEFAULT_NOT_UNK_SEL;
+               break;
+           default:
+               elog(ERROR, "unrecognized nulltesttype: %d",
+                    (int) nulltesttype);
+               return (Selectivity) 0;     /* keep compiler quiet */
+       }
     }
  
+   ReleaseVariableStats(vardata);
+
     /* result should be in range, but make sure... */
     CLAMP_PROBABILITY(selec);
  
@@ -1407,293 +1356,257 @@ eqjoinsel(PG_FUNCTION_ARGS)
     Oid         operator = PG_GETARG_OID(1);
     List       *args = (List *) PG_GETARG_POINTER(2);
     JoinType    jointype = (JoinType) PG_GETARG_INT16(3);
-   Var        *var1;
-   Var        *var2;
     double      selec;
+   VariableStatData vardata1;
+   VariableStatData vardata2;
+   double      nd1;
+   double      nd2;
+   Form_pg_statistic stats1 = NULL;
+   Form_pg_statistic stats2 = NULL;
+   bool        have_mcvs1 = false;
+   Datum      *values1 = NULL;
+   int         nvalues1 = 0;
+   float4     *numbers1 = NULL;
+   int         nnumbers1 = 0;
+   bool        have_mcvs2 = false;
+   Datum      *values2 = NULL;
+   int         nvalues2 = 0;
+   float4     *numbers2 = NULL;
+   int         nnumbers2 = 0;
+
+   get_join_variables(root, args, &vardata1, &vardata2);
+
+   nd1 = get_variable_numdistinct(&vardata1);
+   nd2 = get_variable_numdistinct(&vardata2);
+
+   if (HeapTupleIsValid(vardata1.statsTuple))
+   {
+       stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+       have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
+                                     vardata1.atttype,
+                                     vardata1.atttypmod,
+                                     STATISTIC_KIND_MCV,
+                                     InvalidOid,
+                                     &values1, &nvalues1,
+                                     &numbers1, &nnumbers1);
+   }
  
-   get_join_vars(args, &var1, &var2);
+   if (HeapTupleIsValid(vardata2.statsTuple))
+   {
+       stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+       have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
+                                     vardata2.atttype,
+                                     vardata2.atttypmod,
+                                     STATISTIC_KIND_MCV,
+                                     InvalidOid,
+                                     &values2, &nvalues2,
+                                     &numbers2, &nnumbers2);
+   }
  
-   if (var1 == NULL && var2 == NULL)
-       selec = DEFAULT_EQ_SEL;
-   else
+   if (have_mcvs1 && have_mcvs2)
     {
-       HeapTuple   statsTuple1 = NULL;
-       HeapTuple   statsTuple2 = NULL;
-       Form_pg_statistic stats1 = NULL;
-       Form_pg_statistic stats2 = NULL;
-       double      nd1 = DEFAULT_NUM_DISTINCT;
-       double      nd2 = DEFAULT_NUM_DISTINCT;
-       bool        have_mcvs1 = false;
-       Datum      *values1 = NULL;
-       int         nvalues1 = 0;
-       float4     *numbers1 = NULL;
-       int         nnumbers1 = 0;
-       bool        have_mcvs2 = false;
-       Datum      *values2 = NULL;
-       int         nvalues2 = 0;
-       float4     *numbers2 = NULL;
-       int         nnumbers2 = 0;
-
-       if (var1 != NULL)
-       {
-           /* get stats for the attribute, if available */
-           Oid         relid1 = getrelid(var1->varno, root->rtable);
+       /*
+        * We have most-common-value lists for both relations.  Run
+        * through the lists to see which MCVs actually join to each
+        * other with the given operator.  This allows us to determine
+        * the exact join selectivity for the portion of the relations
+        * represented by the MCV lists.  We still have to estimate
+        * for the remaining population, but in a skewed distribution
+        * this gives us a big leg up in accuracy.  For motivation see
+        * the analysis in Y. Ioannidis and S. Christodoulakis, "On
+        * the propagation of errors in the size of join results",
+        * Technical Report 1018, Computer Science Dept., University
+        * of Wisconsin, Madison, March 1991 (available from
+        * ftp.cs.wisc.edu).
+        */
+       FmgrInfo    eqproc;
+       bool       *hasmatch1;
+       bool       *hasmatch2;
+       double      nullfrac1 = stats1->stanullfrac;
+       double      nullfrac2 = stats2->stanullfrac;
+       double      matchprodfreq,
+                   matchfreq1,
+                   matchfreq2,
+                   unmatchfreq1,
+                   unmatchfreq2,
+                   otherfreq1,
+                   otherfreq2,
+                   totalsel1,
+                   totalsel2;
+       int         i,
+                   nmatches;
+
+       fmgr_info(get_opcode(operator), &eqproc);
+       hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
+       hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
  
-           if (relid1 != InvalidOid)
-           {
-               statsTuple1 = SearchSysCache(STATRELATT,
-                                            ObjectIdGetDatum(relid1),
-                                          Int16GetDatum(var1->varattno),
-                                            0, 0);
-               if (HeapTupleIsValid(statsTuple1))
-               {
-                   stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
-                   have_mcvs1 = get_attstatsslot(statsTuple1,
-                                                 var1->vartype,
-                                                 var1->vartypmod,
-                                                 STATISTIC_KIND_MCV,
-                                                 InvalidOid,
-                                                 &values1, &nvalues1,
-                                                 &numbers1, &nnumbers1);
-               }
+       /*
+        * If we are doing any variant of JOIN_IN, pretend all the
+        * values of the righthand relation are unique (ie, act as if
+        * it's been DISTINCT'd).
+        *
+        * NOTE: it might seem that we should unique-ify the lefthand
+        * input when considering JOIN_REVERSE_IN.  But this is not
+        * so, because the join clause we've been handed has not been
+        * commuted from the way the parser originally wrote it.  We
+        * know that the unique side of the IN clause is *always* on
+        * the right.
+        *
+        * NOTE: it would be dangerous to try to be smart about JOIN_LEFT
+        * or JOIN_RIGHT here, because we do not have enough
+        * information to determine which var is really on which side
+        * of the join. Perhaps someday we should pass in more
+        * information.
+        */
+       if (jointype == JOIN_IN ||
+           jointype == JOIN_REVERSE_IN ||
+           jointype == JOIN_UNIQUE_INNER ||
+           jointype == JOIN_UNIQUE_OUTER)
+       {
+           float4      oneovern = 1.0 / nd2;
  
-               nd1 = get_att_numdistinct(root, var1, stats1);
-           }
+           for (i = 0; i < nvalues2; i++)
+               numbers2[i] = oneovern;
+           nullfrac2 = oneovern;
         }
  
-       if (var2 != NULL)
+       /*
+        * Note we assume that each MCV will match at most one member
+        * of the other MCV list.  If the operator isn't really
+        * equality, there could be multiple matches --- but we don't
+        * look for them, both for speed and because the math wouldn't
+        * add up...
+        */
+       matchprodfreq = 0.0;
+       nmatches = 0;
+       for (i = 0; i < nvalues1; i++)
         {
-           /* get stats for the attribute, if available */
-           Oid         relid2 = getrelid(var2->varno, root->rtable);
+           int         j;
  
-           if (relid2 != InvalidOid)
+           for (j = 0; j < nvalues2; j++)
             {
-               statsTuple2 = SearchSysCache(STATRELATT,
-                                            ObjectIdGetDatum(relid2),
-                                          Int16GetDatum(var2->varattno),
-                                            0, 0);
-               if (HeapTupleIsValid(statsTuple2))
+               if (hasmatch2[j])
+                   continue;
+               if (DatumGetBool(FunctionCall2(&eqproc,
+                                              values1[i],
+                                              values2[j])))
                 {
-                   stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
-                   have_mcvs2 = get_attstatsslot(statsTuple2,
-                                                 var2->vartype,
-                                                 var2->vartypmod,
-                                                 STATISTIC_KIND_MCV,
-                                                 InvalidOid,
-                                                 &values2, &nvalues2,
-                                                 &numbers2, &nnumbers2);
+                   hasmatch1[i] = hasmatch2[j] = true;
+                   matchprodfreq += numbers1[i] * numbers2[j];
+                   nmatches++;
+                   break;
                 }
-
-               nd2 = get_att_numdistinct(root, var2, stats2);
             }
         }
-
-       if (have_mcvs1 && have_mcvs2)
+       CLAMP_PROBABILITY(matchprodfreq);
+       /* Sum up frequencies of matched and unmatched MCVs */
+       matchfreq1 = unmatchfreq1 = 0.0;
+       for (i = 0; i < nvalues1; i++)
         {
-           /*
-            * We have most-common-value lists for both relations.  Run
-            * through the lists to see which MCVs actually join to each
-            * other with the given operator.  This allows us to determine
-            * the exact join selectivity for the portion of the relations
-            * represented by the MCV lists.  We still have to estimate
-            * for the remaining population, but in a skewed distribution
-            * this gives us a big leg up in accuracy.  For motivation see
-            * the analysis in Y. Ioannidis and S. Christodoulakis, "On
-            * the propagation of errors in the size of join results",
-            * Technical Report 1018, Computer Science Dept., University
-            * of Wisconsin, Madison, March 1991 (available from
-            * ftp.cs.wisc.edu).
-            */
-           FmgrInfo    eqproc;
-           bool       *hasmatch1;
-           bool       *hasmatch2;
-           double      nullfrac1 = stats1->stanullfrac;
-           double      nullfrac2 = stats2->stanullfrac;
-           double      matchprodfreq,
-                       matchfreq1,
-                       matchfreq2,
-                       unmatchfreq1,
-                       unmatchfreq2,
-                       otherfreq1,
-                       otherfreq2,
-                       totalsel1,
-                       totalsel2;
-           int         i,
-                       nmatches;
-
-           fmgr_info(get_opcode(operator), &eqproc);
-           hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
-           hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
-
-           /*
-            * If we are doing any variant of JOIN_IN, pretend all the
-            * values of the righthand relation are unique (ie, act as if
-            * it's been DISTINCT'd).
-            *
-            * NOTE: it might seem that we should unique-ify the lefthand
-            * input when considering JOIN_REVERSE_IN.  But this is not
-            * so, because the join clause we've been handed has not been
-            * commuted from the way the parser originally wrote it.  We
-            * know that the unique side of the IN clause is *always* on
-            * the right.
-            *
-            * NOTE: it would be dangerous to try to be smart about JOIN_LEFT
-            * or JOIN_RIGHT here, because we do not have enough
-            * information to determine which var is really on which side
-            * of the join. Perhaps someday we should pass in more
-            * information.
-            */
-           if (jointype == JOIN_IN ||
-               jointype == JOIN_REVERSE_IN ||
-               jointype == JOIN_UNIQUE_INNER ||
-               jointype == JOIN_UNIQUE_OUTER)
-           {
-               float4      oneovern = 1.0 / nd2;
-
-               for (i = 0; i < nvalues2; i++)
-                   numbers2[i] = oneovern;
-               nullfrac2 = oneovern;
-           }
-
-           /*
-            * Note we assume that each MCV will match at most one member
-            * of the other MCV list.  If the operator isn't really
-            * equality, there could be multiple matches --- but we don't
-            * look for them, both for speed and because the math wouldn't
-            * add up...
-            */
-           matchprodfreq = 0.0;
-           nmatches = 0;
-           for (i = 0; i < nvalues1; i++)
-           {
-               int         j;
+           if (hasmatch1[i])
+               matchfreq1 += numbers1[i];
+           else
+               unmatchfreq1 += numbers1[i];
+       }
+       CLAMP_PROBABILITY(matchfreq1);
+       CLAMP_PROBABILITY(unmatchfreq1);
+       matchfreq2 = unmatchfreq2 = 0.0;
+       for (i = 0; i < nvalues2; i++)
+       {
+           if (hasmatch2[i])
+               matchfreq2 += numbers2[i];
+           else
+               unmatchfreq2 += numbers2[i];
+       }
+       CLAMP_PROBABILITY(matchfreq2);
+       CLAMP_PROBABILITY(unmatchfreq2);
+       pfree(hasmatch1);
+       pfree(hasmatch2);
  
-               for (j = 0; j < nvalues2; j++)
-               {
-                   if (hasmatch2[j])
-                       continue;
-                   if (DatumGetBool(FunctionCall2(&eqproc,
-                                                  values1[i],
-                                                  values2[j])))
-                   {
-                       hasmatch1[i] = hasmatch2[j] = true;
-                       matchprodfreq += numbers1[i] * numbers2[j];
-                       nmatches++;
-                       break;
-                   }
-               }
-           }
-           CLAMP_PROBABILITY(matchprodfreq);
-           /* Sum up frequencies of matched and unmatched MCVs */
-           matchfreq1 = unmatchfreq1 = 0.0;
-           for (i = 0; i < nvalues1; i++)
-           {
-               if (hasmatch1[i])
-                   matchfreq1 += numbers1[i];
-               else
-                   unmatchfreq1 += numbers1[i];
-           }
-           CLAMP_PROBABILITY(matchfreq1);
-           CLAMP_PROBABILITY(unmatchfreq1);
-           matchfreq2 = unmatchfreq2 = 0.0;
-           for (i = 0; i < nvalues2; i++)
-           {
-               if (hasmatch2[i])
-                   matchfreq2 += numbers2[i];
-               else
-                   unmatchfreq2 += numbers2[i];
-           }
-           CLAMP_PROBABILITY(matchfreq2);
-           CLAMP_PROBABILITY(unmatchfreq2);
-           pfree(hasmatch1);
-           pfree(hasmatch2);
+       /*
+        * Compute total frequency of non-null values that are not in
+        * the MCV lists.
+        */
+       otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
+       otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
+       CLAMP_PROBABILITY(otherfreq1);
+       CLAMP_PROBABILITY(otherfreq2);
  
-           /*
-            * Compute total frequency of non-null values that are not in
-            * the MCV lists.
-            */
-           otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
-           otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
-           CLAMP_PROBABILITY(otherfreq1);
-           CLAMP_PROBABILITY(otherfreq2);
+       /*
+        * We can estimate the total selectivity from the point of
+        * view of relation 1 as: the known selectivity for matched
+        * MCVs, plus unmatched MCVs that are assumed to match against
+        * random members of relation 2's non-MCV population, plus
+        * non-MCV values that are assumed to match against random
+        * members of relation 2's unmatched MCVs plus non-MCV values.
+        */
+       totalsel1 = matchprodfreq;
+       if (nd2 > nvalues2)
+           totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
+       if (nd2 > nmatches)
+           totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
+               (nd2 - nmatches);
+       /* Same estimate from the point of view of relation 2. */
+       totalsel2 = matchprodfreq;
+       if (nd1 > nvalues1)
+           totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
+       if (nd1 > nmatches)
+           totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
+               (nd1 - nmatches);
  
-           /*
-            * We can estimate the total selectivity from the point of
-            * view of relation 1 as: the known selectivity for matched
-            * MCVs, plus unmatched MCVs that are assumed to match against
-            * random members of relation 2's non-MCV population, plus
-            * non-MCV values that are assumed to match against random
-            * members of relation 2's unmatched MCVs plus non-MCV values.
-            */
-           totalsel1 = matchprodfreq;
-           if (nd2 > nvalues2)
-               totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
-           if (nd2 > nmatches)
-               totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
-                   (nd2 - nmatches);
-           /* Same estimate from the point of view of relation 2. */
-           totalsel2 = matchprodfreq;
-           if (nd1 > nvalues1)
-               totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
-           if (nd1 > nmatches)
-               totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
-                   (nd1 - nmatches);
+       /*
+        * Use the smaller of the two estimates.  This can be
+        * justified in essentially the same terms as given below for
+        * the no-stats case: to a first approximation, we are
+        * estimating from the point of view of the relation with
+        * smaller nd.
+        */
+       selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
+   }
+   else
+   {
+       /*
+        * We do not have MCV lists for both sides.  Estimate the join
+        * selectivity as
+        * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is
+        * plausible if we assume that the join operator is strict and
+        * the non-null values are about equally distributed: a given
+        * non-null tuple of rel1 will join to either zero or
+        * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are
+        * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
+        * selectivity of not more than
+        * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is
+        * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the
+        * expression with MIN() is an upper bound.  Using the MIN()
+        * means we estimate from the point of view of the relation
+        * with smaller nd (since the larger nd is determining the
+        * MIN).  It is reasonable to assume that most tuples in this
+        * rel will have join partners, so the bound is probably
+        * reasonably tight and should be taken as-is.
+        *
+        * XXX Can we be smarter if we have an MCV list for just one
+        * side? It seems that if we assume equal distribution for the
+        * other side, we end up with the same answer anyway.
+        */
+       double      nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
+       double      nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
  
-           /*
-            * Use the smaller of the two estimates.  This can be
-            * justified in essentially the same terms as given below for
-            * the no-stats case: to a first approximation, we are
-            * estimating from the point of view of the relation with
-            * smaller nd.
-            */
-           selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
-       }
+       selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
+       if (nd1 > nd2)
+           selec /= nd1;
         else
-       {
-           /*
-            * We do not have MCV lists for both sides.  Estimate the join
-            * selectivity as
-            * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is
-            * plausible if we assume that the join operator is strict and
-            * the non-null values are about equally distributed: a given
-            * non-null tuple of rel1 will join to either zero or
-            * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are
-            * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
-            * selectivity of not more than
-            * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is
-            * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the
-            * expression with MIN() is an upper bound.  Using the MIN()
-            * means we estimate from the point of view of the relation
-            * with smaller nd (since the larger nd is determining the
-            * MIN).  It is reasonable to assume that most tuples in this
-            * rel will have join partners, so the bound is probably
-            * reasonably tight and should be taken as-is.
-            *
-            * XXX Can we be smarter if we have an MCV list for just one
-            * side? It seems that if we assume equal distribution for the
-            * other side, we end up with the same answer anyway.
-            */
-           double      nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
-           double      nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
+           selec /= nd2;
+   }
  
-           selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
-           if (nd1 > nd2)
-               selec /= nd1;
-           else
-               selec /= nd2;
-       }
+   if (have_mcvs1)
+       free_attstatsslot(vardata1.atttype, values1, nvalues1,
+                         numbers1, nnumbers1);
+   if (have_mcvs2)
+       free_attstatsslot(vardata2.atttype, values2, nvalues2,
+                         numbers2, nnumbers2);
  
-       if (have_mcvs1)
-           free_attstatsslot(var1->vartype, values1, nvalues1,
-                             numbers1, nnumbers1);
-       if (have_mcvs2)
-           free_attstatsslot(var2->vartype, values2, nvalues2,
-                             numbers2, nnumbers2);
-       if (HeapTupleIsValid(statsTuple1))
-           ReleaseSysCache(statsTuple1);
-       if (HeapTupleIsValid(statsTuple2))
-           ReleaseSysCache(statsTuple2);
-   }
+   ReleaseVariableStats(vardata1);
+   ReleaseVariableStats(vardata2);
  
     CLAMP_PROBABILITY(selec);
  
@@ -1860,8 +1773,10 @@ mergejoinscansel(Query *root, Node *clause,
                  Selectivity *leftscan,
                  Selectivity *rightscan)
  {
-   Var        *left,
+   Node       *left,
                *right;
+   VariableStatData leftvar,
+               rightvar;
     Oid         lefttype,
                 righttype;
     Oid         opno,
@@ -1883,42 +1798,31 @@ mergejoinscansel(Query *root, Node *clause,
     if (!is_opclause(clause))
         return;                 /* shouldn't happen */
     opno = ((OpExpr *) clause)->opno;
-   left = (Var *) get_leftop((Expr *) clause);
-   right = (Var *) get_rightop((Expr *) clause);
+   left = get_leftop((Expr *) clause);
+   right = get_rightop((Expr *) clause);
     if (!right)
         return;                 /* shouldn't happen */
  
-   /* Save the direct input types of the operator */
-   lefttype = exprType((Node *) left);
-   righttype = exprType((Node *) right);
+   /* Look for stats for the inputs */
+   examine_variable(root, left, 0, &leftvar);
+   examine_variable(root, right, 0, &rightvar);
  
-   /*
-    * Now skip any binary-compatible relabeling; there can only be one
-    * level since constant-expression folder eliminates adjacent
-    * RelabelTypes.
-    */
-   if (IsA(left, RelabelType))
-       left = (Var *) ((RelabelType *) left)->arg;
-   if (IsA(right, RelabelType))
-       right = (Var *) ((RelabelType *) right)->arg;
-
-   /* Can't do anything if inputs are not Vars */
-   if (!IsA(left, Var) ||
-       !IsA(right, Var))
-       return;
+   /* Get the direct input types of the operator */
+   lefttype = exprType(left);
+   righttype = exprType(right);
  
     /* Verify mergejoinability and get left and right "<" operators */
     if (!op_mergejoinable(opno,
                           &lsortop,
                           &rsortop))
-       return;                 /* shouldn't happen */
+       goto fail;              /* shouldn't happen */
  
-   /* Try to get maximum values of both vars */
-   if (!get_var_maximum(root, left, lsortop, &leftmax))
-       return;                 /* no max available from stats */
+   /* Try to get maximum values of both inputs */
+   if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
+       goto fail;              /* no max available from stats */
  
-   if (!get_var_maximum(root, right, rsortop, &rightmax))
-       return;                 /* no max available from stats */
+   if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
+       goto fail;              /* no max available from stats */
  
     /* Look up the "left < right" and "left > right" operators */
     op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
@@ -1926,30 +1830,30 @@ mergejoinscansel(Query *root, Node *clause,
     /* Look up the "left <= right" operator */
     leop = get_negator(gtop);
     if (!OidIsValid(leop))
-       return;                 /* insufficient info in catalogs */
+       goto fail;              /* insufficient info in catalogs */
  
     /* Look up the "right > left" operator */
     revgtop = get_commutator(ltop);
     if (!OidIsValid(revgtop))
-       return;                 /* insufficient info in catalogs */
+       goto fail;              /* insufficient info in catalogs */
  
     /* Look up the "right <= left" operator */
     revleop = get_negator(revgtop);
     if (!OidIsValid(revleop))
-       return;                 /* insufficient info in catalogs */
+       goto fail;              /* insufficient info in catalogs */
  
     /*
      * Now, the fraction of the left variable that will be scanned is the
      * fraction that's <= the right-side maximum value.  But only believe
      * non-default estimates, else stick with our 1.0.
      */
-   selec = scalarineqsel(root, leop, false, left,
+   selec = scalarineqsel(root, leop, false, &leftvar,
                           rightmax, righttype);
     if (selec != DEFAULT_INEQ_SEL)
         *leftscan = selec;
  
     /* And similarly for the right variable. */
-   selec = scalarineqsel(root, revleop, false, right,
+   selec = scalarineqsel(root, revleop, false, &rightvar,
                           leftmax, lefttype);
     if (selec != DEFAULT_INEQ_SEL)
         *rightscan = selec;
@@ -1966,6 +1870,10 @@ mergejoinscansel(Query *root, Node *clause,
         *rightscan = 1.0;
     else
         *leftscan = *rightscan = 1.0;
+
+fail:
+   ReleaseVariableStats(leftvar);
+   ReleaseVariableStats(rightvar);
  }
  
  /*
@@ -2076,25 +1984,14 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
     foreach(l, allvars)
     {
         Var        *var = (Var *) lfirst(l);
-       Oid         relid = getrelid(var->varno, root->rtable);
-       HeapTuple   statsTuple = NULL;
-       Form_pg_statistic stats = NULL;
+       VariableStatData vardata;
         double      ndistinct;
         bool        keep = true;
         List       *l2;
  
-       if (OidIsValid(relid))
-       {
-           statsTuple = SearchSysCache(STATRELATT,
-                                       ObjectIdGetDatum(relid),
-                                       Int16GetDatum(var->varattno),
-                                       0, 0);
-           if (HeapTupleIsValid(statsTuple))
-               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
-       }
-       ndistinct = get_att_numdistinct(root, var, stats);
-       if (HeapTupleIsValid(statsTuple))
-           ReleaseSysCache(statsTuple);
+       examine_variable(root, (Node *) var, 0, &vardata);
+       ndistinct = get_variable_numdistinct(&vardata);
+       ReleaseVariableStats(vardata);
  
         /* cannot use foreach here because of possible lremove */
         l2 = varinfos;
@@ -2201,143 +2098,152 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
     return numdistinct;
  }
  
-
-/*-------------------------------------------------------------------------
+/*
+ * Estimate hash bucketsize fraction (ie, number of entries in a bucket
+ * divided by total tuples in relation) if the specified expression is used
+ * as a hash key.
   *
- * Support routines
+ * XXX This is really pretty bogus since we're effectively assuming that the
+ * distribution of hash keys will be the same after applying restriction
+ * clauses as it was in the underlying relation.  However, we are not nearly
+ * smart enough to figure out how the restrict clauses might change the
+ * distribution, so this will have to do for now.
   *
- *-------------------------------------------------------------------------
- */
-
-/*
- * get_var_maximum
- *     Estimate the maximum value of the specified variable.
- *     If successful, store value in *max and return TRUE.
- *     If no data available, return FALSE.
+ * We are passed the number of buckets the executor will use for the given
+ * input relation. If the data were perfectly distributed, with the same
+ * number of tuples going into each available bucket, then the bucketsize
+ * fraction would be 1/nbuckets.  But this happy state of affairs will occur
+ * only if (a) there are at least nbuckets distinct data values, and (b)
+ * we have a not-too-skewed data distribution. Otherwise the buckets will
+ * be nonuniformly occupied.  If the other relation in the join has a key
+ * distribution similar to this one's, then the most-loaded buckets are
+ * exactly those that will be probed most often.  Therefore, the "average"
+ * bucket size for costing purposes should really be taken as something close
+ * to the "worst case" bucket size.  We try to estimate this by adjusting the
+ * fraction if there are too few distinct data values, and then scaling up
+ * by the ratio of the most common value's frequency to the average frequency.
   *
- * sortop is the "<" comparison operator to use.  (To extract the
- * minimum instead of the maximum, just pass the ">" operator instead.)
+ * If no statistics are available, use a default estimate of 0.1.  This will
+ * discourage use of a hash rather strongly if the inner relation is large,
+ * which is what we want.  We do not want to hash unless we know that the
+ * inner rel is well-dispersed (or the alternatives seem much worse).
   */
-static bool
-get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
+Selectivity
+estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets)
  {
-   Datum       tmax = 0;
-   bool        have_max = false;
-   Oid         relid;
-   HeapTuple   statsTuple;
-   Form_pg_statistic stats;
-   int16       typLen;
-   bool        typByVal;
-   Datum      *values;
-   int         nvalues;
-   int         i;
+   VariableStatData vardata;
+   double      estfract,
+               ndistinct,
+               stanullfrac,
+               mcvfreq,
+               avgfreq;
+   float4     *numbers;
+   int         nnumbers;
  
-   relid = getrelid(var->varno, root->rtable);
-   if (relid == InvalidOid)
-       return false;
+   examine_variable(root, hashkey, 0, &vardata);
  
-   /* get stats for the attribute */
-   statsTuple = SearchSysCache(STATRELATT,
-                               ObjectIdGetDatum(relid),
-                               Int16GetDatum(var->varattno),
-                               0, 0);
-   if (!HeapTupleIsValid(statsTuple))
+   /* Get number of distinct values and fraction that are null */
+   ndistinct = get_variable_numdistinct(&vardata);
+
+   if (HeapTupleIsValid(vardata.statsTuple))
     {
-       /* no stats available, so default result */
-       return false;
+       Form_pg_statistic stats;
+
+       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+       stanullfrac = stats->stanullfrac;
+   }
+   else
+   {
+       /*
+        * Believe a default ndistinct only if it came from stats.
+        * Otherwise punt and return 0.1, per comments above.
+        */
+       if (ndistinct == DEFAULT_NUM_DISTINCT)
+       {
+           ReleaseVariableStats(vardata);
+           return (Selectivity) 0.1;
+       }
+
+       stanullfrac = 0.0;
     }
-   stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
  
-   get_typlenbyval(var->vartype, &typLen, &typByVal);
+   /* Compute avg freq of all distinct data values in raw relation */
+   avgfreq = (1.0 - stanullfrac) / ndistinct;
  
     /*
-    * If there is a histogram, grab the last or first value as
-    * appropriate.
+    * Adjust ndistinct to account for restriction clauses.  Observe we
+    * are assuming that the data distribution is affected uniformly by
+    * the restriction clauses!
      *
-    * If there is a histogram that is sorted with some other operator than
-    * the one we want, fail --- this suggests that there is data we can't
-    * use.
+    * XXX Possibly better way, but much more expensive: multiply by
+    * selectivity of rel's restriction clauses that mention the target
+    * Var.
      */
-   if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                        STATISTIC_KIND_HISTOGRAM, sortop,
-                        &values, &nvalues,
-                        NULL, NULL))
-   {
-       if (nvalues > 0)
-       {
-           tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
-           have_max = true;
-       }
-       free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-   }
+   if (vardata.rel)
+       ndistinct *= vardata.rel->rows / vardata.rel->tuples;
+
+   /*
+    * Initial estimate of bucketsize fraction is 1/nbuckets as long as
+    * the number of buckets is less than the expected number of distinct
+    * values; otherwise it is 1/ndistinct.
+    */
+   if (ndistinct > (double) nbuckets)
+       estfract = 1.0 / (double) nbuckets;
     else
-   {
-       Oid         rsortop = get_commutator(sortop);
+       estfract = 1.0 / ndistinct;
  
-       if (OidIsValid(rsortop) &&
-           get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                            STATISTIC_KIND_HISTOGRAM, rsortop,
-                            &values, &nvalues,
-                            NULL, NULL))
-       {
-           if (nvalues > 0)
-           {
-               tmax = datumCopy(values[0], typByVal, typLen);
-               have_max = true;
-           }
-           free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-       }
-       else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
-                                 &values, &nvalues,
-                                 NULL, NULL))
+   /*
+    * Look up the frequency of the most common value, if available.
+    */
+   mcvfreq = 0.0;
+
+   if (HeapTupleIsValid(vardata.statsTuple))
+   {
+       if (get_attstatsslot(vardata.statsTuple,
+                            vardata.atttype, vardata.atttypmod,
+                            STATISTIC_KIND_MCV, InvalidOid,
+                            NULL, NULL, &numbers, &nnumbers))
         {
-           free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-           ReleaseSysCache(statsTuple);
-           return false;
+           /*
+            * The first MCV stat is for the most common value.
+            */
+           if (nnumbers > 0)
+               mcvfreq = numbers[0];
+           free_attstatsslot(vardata.atttype, NULL, 0,
+                             numbers, nnumbers);
         }
     }
  
     /*
-    * If we have most-common-values info, look for a large MCV.  This is
-    * needed even if we also have a histogram, since the histogram
-    * excludes the MCVs.  However, usually the MCVs will not be the
-    * extreme values, so avoid unnecessary data copying.
+    * Adjust estimated bucketsize upward to account for skewed
+    * distribution.
      */
-   if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                        STATISTIC_KIND_MCV, InvalidOid,
-                        &values, &nvalues,
-                        NULL, NULL))
-   {
-       bool        large_mcv = false;
-       FmgrInfo    opproc;
-
-       fmgr_info(get_opcode(sortop), &opproc);
+   if (avgfreq > 0.0 && mcvfreq > avgfreq)
+       estfract *= mcvfreq / avgfreq;
  
-       for (i = 0; i < nvalues; i++)
-       {
-           if (!have_max)
-           {
-               tmax = values[i];
-               large_mcv = have_max = true;
-           }
-           else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
-           {
-               tmax = values[i];
-               large_mcv = true;
-           }
-       }
-       if (large_mcv)
-           tmax = datumCopy(tmax, typByVal, typLen);
-       free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-   }
+   /*
+    * Clamp bucketsize to sane range (the above adjustment could easily
+    * produce an out-of-range result).  We set the lower bound a little
+    * above zero, since zero isn't a very sane result.
+    */
+   if (estfract < 1.0e-6)
+       estfract = 1.0e-6;
+   else if (estfract > 1.0)
+       estfract = 1.0;
  
-   ReleaseSysCache(statsTuple);
+   ReleaseVariableStats(vardata);
  
-   *max = tmax;
-   return have_max;
+   return (Selectivity) estfract;
  }
  
+
+/*-------------------------------------------------------------------------
+ *
+ * Support routines
+ *
+ *-------------------------------------------------------------------------
+ */
+
  /*
   * convert_to_scalar
   *   Convert non-NULL values of the indicated types to the comparison
@@ -2903,185 +2809,522 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
  
  
  /*
- * get_att_numdistinct
- *   Estimate the number of distinct values of an attribute.
+ * get_restriction_variable
+ *     Examine the args of a restriction clause to see if it's of the
+ *     form (variable op pseudoconstant) or (pseudoconstant op variable),
+ *     where "variable" could be either a Var or an expression in vars of a
+ *     single relation.  If so, extract information about the variable,
+ *     and also indicate which side it was on and the other argument.
   *
- * var: identifies the attribute to examine.
- * stats: pg_statistic tuple for attribute, or NULL if not available.
+ * Inputs:
+ * root: the Query
+ * args: clause argument list
+ * varRelid: see specs for restriction selectivity functions
   *
- * NB: be careful to produce an integral result, since callers may compare
- * the result to exact integer counts.
+ * Outputs: (these are valid only if TRUE is returned)
+ * *vardata: gets information about variable (see examine_variable)
+ * *other: gets other clause argument, stripped of binary relabeling
+ * *varonleft: set TRUE if variable is on the left, FALSE if on the right
+ *
+ * Returns TRUE if a variable is identified, otherwise FALSE.
+ *
+ * Note: if there are Vars on both sides of the clause, we must fail, because
+ * callers are expecting that the other side will act like a pseudoconstant.
   */
-static double
-get_att_numdistinct(Query *root, Var *var, Form_pg_statistic stats)
+static bool
+get_restriction_variable(Query *root, List *args, int varRelid,
+                        VariableStatData *vardata, Node **other,
+                        bool *varonleft)
  {
-   RelOptInfo *rel;
-   double      ntuples;
-
-   /*
-    * Special-case boolean columns: presumably, two distinct values.
-    *
-    * Are there any other cases we should wire in special estimates for?
-    */
-   if (var->vartype == BOOLOID)
-       return 2.0;
+   Node       *left,
+              *right;
+   VariableStatData rdata;
  
-   /*
-    * Otherwise we need to get the relation size.
-    */
-   rel = find_base_rel(root, var->varno);
-   ntuples = rel->tuples;
+   /* Fail if not a binary opclause (probably shouldn't happen) */
+   if (length(args) != 2)
+       return false;
  
-   if (ntuples <= 0.0)
-       return DEFAULT_NUM_DISTINCT;    /* no data available; return a
-                                        * default */
+   left = (Node *) lfirst(args);
+   right = (Node *) lsecond(args);
  
     /*
-    * Look to see if there is a unique index on the attribute. If so, we
-    * assume it's distinct, ignoring pg_statistic info which could be out
-    * of date.
+    * Examine both sides.  Note that when varRelid is nonzero, Vars of
+    * other relations will be treated as pseudoconstants.
      */
-   if (has_unique_index(rel, var->varattno))
-       return ntuples;
+   examine_variable(root, left, varRelid, vardata);
+   examine_variable(root, right, varRelid, &rdata);
  
     /*
-    * If ANALYZE determined a fixed or scaled estimate, use it.
+    * If one side is a variable and the other not, we win.
      */
-   if (stats)
+   if (vardata->rel && rdata.rel == NULL)
     {
-       if (stats->stadistinct > 0.0)
-           return stats->stadistinct;
-       if (stats->stadistinct < 0.0)
-           return floor((-stats->stadistinct * ntuples) + 0.5);
+       *varonleft = true;
+       *other = rdata.var;
+       /* Assume we need no ReleaseVariableStats(rdata) here */
+       return true;
     }
  
-   /*
-    * ANALYZE does not compute stats for system attributes, but some of
-    * them can reasonably be assumed unique anyway.
-    */
-   switch (var->varattno)
+   if (vardata->rel == NULL && rdata.rel)
     {
-       case ObjectIdAttributeNumber:
-       case SelfItemPointerAttributeNumber:
-           return ntuples;
-       case TableOidAttributeNumber:
-           return 1.0;
+       *varonleft = false;
+       *other = vardata->var;
+       /* Assume we need no ReleaseVariableStats(*vardata) here */
+       *vardata = rdata;
+       return true;
     }
  
-   /*
-    * Estimate ndistinct = ntuples if the table is small, else use
-    * default.
-    */
-   if (ntuples < DEFAULT_NUM_DISTINCT)
-       return ntuples;
+   /* Oops, clause has wrong structure (probably var op var) */
+   ReleaseVariableStats(*vardata);
+   ReleaseVariableStats(rdata);
  
-   return DEFAULT_NUM_DISTINCT;
+   return false;
  }
  
  /*
- * get_restriction_var
- *     Examine the args of a restriction clause to see if it's of the
- *     form (var op something) or (something op var).  If so, extract
- *     and return the var and the other argument.
- *
- * Inputs:
- * args: clause argument list
- * varRelid: see specs for restriction selectivity functions
- *
- * Outputs: (these are set only if TRUE is returned)
- * *var: gets Var node
- * *other: gets other clause argument
- * *varonleft: set TRUE if var is on the left, FALSE if on the right
- *
- * Returns TRUE if a Var is identified, otherwise FALSE.
+ * get_join_variables
+ *     Apply examine_variable() to each side of a join clause.
   */
-static bool
-get_restriction_var(List *args,
-                   int varRelid,
-                   Var **var,
-                   Node **other,
-                   bool *varonleft)
+static void
+get_join_variables(Query *root, List *args,
+                  VariableStatData *vardata1, VariableStatData *vardata2)
  {
     Node       *left,
                *right;
  
     if (length(args) != 2)
-       return false;
+       elog(ERROR, "join operator should take two arguments");
  
     left = (Node *) lfirst(args);
     right = (Node *) lsecond(args);
  
+   examine_variable(root, left, 0, vardata1);
+   examine_variable(root, right, 0, vardata2);
+}
+
+/*
+ * examine_variable
+ *     Try to look up statistical data about an expression.
+ *     Fill in a VariableStatData struct to describe the expression.
+ *
+ * Inputs:
+ * root: the Query
+ * node: the expression tree to examine
+ * varRelid: see specs for restriction selectivity functions
+ *
+ * Outputs: *vardata is filled as follows:
+ * var: the input expression (with any binary relabeling stripped)
+ * rel: RelOptInfo for relation containing variable; NULL if expression
+ *     contains no Vars (NOTE this could point to a RelOptInfo of a
+ *     subquery, not one in the current query).
+ * statsTuple: the pg_statistic entry for the variable, if one exists;
+ *     otherwise NULL.
+ * atttype, atttypmod: type data to pass to get_attstatsslot().  This is
+ *     commonly the same as the exposed type of the variable argument,
+ *     but can be different in binary-compatible-type cases.
+ *
+ * Caller is responsible for doing ReleaseVariableStats() before exiting.
+ */
+static void
+examine_variable(Query *root, Node *node, int varRelid,
+                VariableStatData *vardata)
+{
+   Relids      varnos;
+   RelOptInfo *onerel;
+
+   /* Make sure we don't return dangling pointers in vardata */
+   MemSet(vardata, 0, sizeof(VariableStatData));
+
     /* Ignore any binary-compatible relabeling */
  
-   if (IsA(left, RelabelType))
-       left = (Node *) ((RelabelType *) left)->arg;
-   if (IsA(right, RelabelType))
-       right = (Node *) ((RelabelType *) right)->arg;
+   if (IsA(node, RelabelType))
+       node = (Node *) ((RelabelType *) node)->arg;
  
-   /* Look for the var */
+   vardata->var = node;
  
-   if (IsA(left, Var) &&
-       (varRelid == 0 || varRelid == ((Var *) left)->varno))
+   /* Fast path for a simple Var */
+
+   if (IsA(node, Var) &&
+       (varRelid == 0 || varRelid == ((Var *) node)->varno))
     {
-       *var = (Var *) left;
-       *other = right;
-       *varonleft = true;
+       Var        *var = (Var *) node;
+       Oid         relid;
+
+       vardata->rel = find_base_rel(root, var->varno);
+       vardata->atttype = var->vartype;
+       vardata->atttypmod = var->vartypmod;
+
+       relid = getrelid(var->varno, root->rtable);
+
+       if (OidIsValid(relid))
+       {
+           vardata->statsTuple = SearchSysCache(STATRELATT,
+                                                ObjectIdGetDatum(relid),
+                                                Int16GetDatum(var->varattno),
+                                                0, 0);
+       }
+       else
+       {
+           /*
+            * XXX This means the Var comes from a JOIN or sub-SELECT.  Later
+            * add code to dig down into the join etc and see if we can trace
+            * the variable to something with stats.  (But beware of
+            * sub-SELECTs with DISTINCT/GROUP BY/etc.  Perhaps there are
+            * no cases where this would really be useful, because we'd have
+            * flattened the subselect if it is??)
+            */
+       }
+
+       return;
     }
-   else if (IsA(right, Var) &&
-            (varRelid == 0 || varRelid == ((Var *) right)->varno))
+
+   /*
+    * Okay, it's a more complicated expression.  Determine variable
+    * membership.  Note that when varRelid isn't zero, only vars of
+    * that relation are considered "real" vars.
+    */
+   varnos = pull_varnos(node);
+
+   onerel = NULL;
+
+   switch (bms_membership(varnos))
     {
-       *var = (Var *) right;
-       *other = left;
-       *varonleft = false;
+       case BMS_EMPTY_SET:
+           /* No Vars at all ... must be pseudo-constant clause */
+           break;
+       case BMS_SINGLETON:
+           if (varRelid == 0 || bms_is_member(varRelid, varnos))
+           {
+               onerel = find_base_rel(root,
+                        (varRelid ? varRelid : bms_singleton_member(varnos)));
+               vardata->rel = onerel;
+           }
+           /* else treat it as a constant */
+           break;
+       case BMS_MULTIPLE:
+           if (varRelid == 0)
+           {
+               /* treat it as a variable of a join relation */
+               vardata->rel = find_join_rel(root, varnos);
+           }
+           else if (bms_is_member(varRelid, varnos))
+           {
+               /* ignore the vars belonging to other relations */
+               vardata->rel = find_base_rel(root, varRelid);
+               /* note: no point in expressional-index search here */
+           }
+           /* else treat it as a constant */
+           break;
+   }
+
+   bms_free(varnos);
+
+   vardata->atttype = exprType(node);
+   vardata->atttypmod = exprTypmod(node);
+
+   if (onerel)
+   {
+       /*
+        * We have an expression in vars of a single relation.  Try to
+        * match it to expressional index columns, in hopes of finding
+        * some statistics.
+        *
+        * XXX it's conceivable that there are multiple matches with
+        * different index opclasses; if so, we need to pick one that
+        * matches the operator we are estimating for.  FIXME later.
+        */
+       List       *ilist;
+
+       foreach(ilist, onerel->indexlist)
+       {
+           IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
+           List       *indexprs;
+           int         pos;
+
+           indexprs = index->indexprs;
+           if (indexprs == NIL)
+               continue;       /* no expressions here... */
+
+           /*
+            * Ignore partial indexes since they probably don't reflect
+            * whole-relation statistics.  Possibly reconsider this later.
+            */
+           if (index->indpred)
+               continue;
+
+           for (pos = 0; pos < index->ncolumns; pos++)
+           {
+               if (index->indexkeys[pos] == 0)
+               {
+                   Node       *indexkey;
+
+                   if (indexprs == NIL)
+                       elog(ERROR, "too few entries in indexprs list");
+                   indexkey = (Node *) lfirst(indexprs);
+                   if (indexkey && IsA(indexkey, RelabelType))
+                       indexkey = (Node *) ((RelabelType *) indexkey)->arg;
+                   if (equal(node, indexkey))
+                   {
+                       /*
+                        * Found a match ... is it a unique index?
+                        * Tests here should match has_unique_index().
+                        */
+                       if (index->unique &&
+                           index->ncolumns == 1 &&
+                           index->indpred == NIL)
+                           vardata->isunique = true;
+                       /* Has it got stats? */
+                       vardata->statsTuple = SearchSysCache(STATRELATT,
+                                                            ObjectIdGetDatum(index->indexoid),
+                                                            Int16GetDatum(pos + 1),
+                                                            0, 0);
+                       if (vardata->statsTuple)
+                           break;
+                   }
+                   indexprs = lnext(indexprs);
+               }
+           }
+           if (vardata->statsTuple)
+               break;
+       }
+   }
+}
+
+/*
+ * get_variable_numdistinct
+ *   Estimate the number of distinct values of a variable.
+ *
+ * vardata: results of examine_variable
+ *
+ * NB: be careful to produce an integral result, since callers may compare
+ * the result to exact integer counts.
+ */
+static double
+get_variable_numdistinct(VariableStatData *vardata)
+{
+   double      stadistinct;
+   double      ntuples;
+
+   /*
+    * Determine the stadistinct value to use.  There are cases where
+    * we can get an estimate even without a pg_statistic entry, or
+    * can get a better value than is in pg_statistic.
+    */
+   if (HeapTupleIsValid(vardata->statsTuple))
+   {
+       /* Use the pg_statistic entry */
+       Form_pg_statistic stats;
+
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+       stadistinct = stats->stadistinct;
+   }
+   else if (vardata->atttype == BOOLOID)
+   {
+       /*
+        * Special-case boolean columns: presumably, two distinct values.
+        *
+        * Are there any other datatypes we should wire in special
+        * estimates for?
+        */
+       stadistinct = 2.0;
     }
     else
     {
-       /* Duh, it's too complicated for me... */
-       return false;
+       /*
+        * We don't keep statistics for system columns, but in some
+        * cases we can infer distinctness anyway.
+        */
+       if (vardata->var && IsA(vardata->var, Var))
+       {
+           switch (((Var *) vardata->var)->varattno)
+           {
+               case ObjectIdAttributeNumber:
+               case SelfItemPointerAttributeNumber:
+                   stadistinct = -1.0;         /* unique */
+                   break;
+               case TableOidAttributeNumber:
+                   stadistinct = 1.0;          /* only 1 value */
+                   break;
+               default:
+                   stadistinct = 0.0;          /* means "unknown" */
+                   break;
+           }
+       }
+       else
+           stadistinct = 0.0;                  /* means "unknown" */
+       /*
+        * XXX consider using estimate_num_groups on expressions?
+        */
+   }
+
+   /*
+    * If there is a unique index for the variable, assume it is unique
+    * no matter what pg_statistic says (the statistics could be out
+    * of date).  Can skip search if we already think it's unique.
+    */
+   if (stadistinct != -1.0)
+   {
+       if (vardata->isunique)
+           stadistinct = -1.0;
+       else if (vardata->var && IsA(vardata->var, Var) &&
+                vardata->rel &&
+                has_unique_index(vardata->rel, 
+                                 ((Var *) vardata->var)->varattno))
+           stadistinct = -1.0;
     }
  
-   return true;
+   /*
+    * If we had an absolute estimate, use that.
+    */
+   if (stadistinct > 0.0)
+       return stadistinct;
+
+   /*
+    * Otherwise we need to get the relation size; punt if not available.
+    */
+   if (vardata->rel == NULL)
+       return DEFAULT_NUM_DISTINCT;
+   ntuples = vardata->rel->tuples;
+   if (ntuples <= 0.0)
+       return DEFAULT_NUM_DISTINCT;
+
+   /*
+    * If we had a relative estimate, use that.
+    */
+   if (stadistinct < 0.0)
+       return floor((-stadistinct * ntuples) + 0.5);
+
+   /*
+    * With no data, estimate ndistinct = ntuples if the table is small,
+    * else use default.
+    */
+   if (ntuples < DEFAULT_NUM_DISTINCT)
+       return ntuples;
+
+   return DEFAULT_NUM_DISTINCT;
  }
  
  /*
- * get_join_vars
+ * get_variable_maximum
+ *     Estimate the maximum value of the specified variable.
+ *     If successful, store value in *max and return TRUE.
+ *     If no data available, return FALSE.
   *
- * Extract the two Vars from a join clause's argument list.  Returns
- * NULL for arguments that are not simple vars.
+ * sortop is the "<" comparison operator to use.  (To extract the
+ * minimum instead of the maximum, just pass the ">" operator instead.)
   */
-static void
-get_join_vars(List *args, Var **var1, Var **var2)
+static bool
+get_variable_maximum(Query *root, VariableStatData *vardata,
+                    Oid sortop, Datum *max)
  {
-   Node       *left,
-              *right;
+   Datum       tmax = 0;
+   bool        have_max = false;
+   Form_pg_statistic stats;
+   int16       typLen;
+   bool        typByVal;
+   Datum      *values;
+   int         nvalues;
+   int         i;
  
-   if (length(args) != 2)
+   if (!HeapTupleIsValid(vardata->statsTuple))
     {
-       *var1 = NULL;
-       *var2 = NULL;
-       return;
+       /* no stats available, so default result */
+       return false;
     }
+   stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
  
-   left = (Node *) lfirst(args);
-   right = (Node *) lsecond(args);
+   get_typlenbyval(vardata->atttype, &typLen, &typByVal);
  
-   /* Ignore any binary-compatible relabeling */
-   if (IsA(left, RelabelType))
-       left = (Node *) ((RelabelType *) left)->arg;
-   if (IsA(right, RelabelType))
-       right = (Node *) ((RelabelType *) right)->arg;
-
-   if (IsA(left, Var))
-       *var1 = (Var *) left;
+   /*
+    * If there is a histogram, grab the last or first value as
+    * appropriate.
+    *
+    * If there is a histogram that is sorted with some other operator than
+    * the one we want, fail --- this suggests that there is data we can't
+    * use.
+    */
+   if (get_attstatsslot(vardata->statsTuple,
+                        vardata->atttype, vardata->atttypmod,
+                        STATISTIC_KIND_HISTOGRAM, sortop,
+                        &values, &nvalues,
+                        NULL, NULL))
+   {
+       if (nvalues > 0)
+       {
+           tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
+           have_max = true;
+       }
+       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+   }
     else
-       *var1 = NULL;
+   {
+       Oid         rsortop = get_commutator(sortop);
  
-   if (IsA(right, Var))
-       *var2 = (Var *) right;
-   else
-       *var2 = NULL;
+       if (OidIsValid(rsortop) &&
+           get_attstatsslot(vardata->statsTuple,
+                            vardata->atttype, vardata->atttypmod,
+                            STATISTIC_KIND_HISTOGRAM, rsortop,
+                            &values, &nvalues,
+                            NULL, NULL))
+       {
+           if (nvalues > 0)
+           {
+               tmax = datumCopy(values[0], typByVal, typLen);
+               have_max = true;
+           }
+           free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+       }
+       else if (get_attstatsslot(vardata->statsTuple,
+                                 vardata->atttype, vardata->atttypmod,
+                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
+                                 &values, &nvalues,
+                                 NULL, NULL))
+       {
+           free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+           return false;
+       }
+   }
+
+   /*
+    * If we have most-common-values info, look for an MCV that exceeds the
+    * maximum found so far.  This is needed even if we also have a histogram,
+    * since the histogram excludes the MCVs.  Usually no MCV is the extreme
+    * value, so copy a datum only when an MCV actually becomes the maximum.
+    */
+   if (get_attstatsslot(vardata->statsTuple,
+                        vardata->atttype, vardata->atttypmod,
+                        STATISTIC_KIND_MCV, InvalidOid,
+                        &values, &nvalues,
+                        NULL, NULL))
+   {
+       bool        large_mcv = false;
+       FmgrInfo    opproc;
+
+       fmgr_info(get_opcode(sortop), &opproc);
+
+       for (i = 0; i < nvalues; i++)
+       {
+           if (!have_max)
+           {
+               tmax = values[i];
+               large_mcv = have_max = true;
+           }
+           else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
+           {
+               tmax = values[i];
+               large_mcv = true;
+           }
+       }
+       if (large_mcv)
+           tmax = datumCopy(tmax, typByVal, typLen);
+       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+   }
+
+   *max = tmax;
+   return have_max;
  }
  
+
  /*-------------------------------------------------------------------------
   *
   * Pattern analysis functions
@@ -3387,10 +3630,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
   * Estimate the selectivity of a fixed prefix for a pattern match.
   *
   * A fixed prefix "foo" is estimated as the selectivity of the expression
- * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
+ * "variable >= 'foo' AND variable < 'fop'" (see also indxqual.c).
   *
   * We use the >= and < operators from the specified btree opclass to do the
- * estimation. The given Var and Const must be of the associated datatype.
+ * estimation. The given variable and Const must be of the associated
+ * datatype.
   *
   * XXX Note: we make use of the upper bound to estimate operator selectivity
   * even if the locale is such that we cannot rely on the upper-bound string.
@@ -3398,7 +3642,8 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
   * more useful to use the upper-bound code than not.
   */
  static Selectivity
-prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
+prefix_selectivity(Query *root, VariableStatData *vardata,
+                  Oid opclass, Const *prefixcon)
  {
     Selectivity prefixsel;
     Oid         cmpopr;
@@ -3409,7 +3654,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
                                 BTGreaterEqualStrategyNumber);
     if (cmpopr == InvalidOid)
         elog(ERROR, "no >= operator for opclass %u", opclass);
-   cmpargs = makeList2(var, prefixcon);
+   cmpargs = makeList2(vardata->var, prefixcon);
     /* Assume scalargtsel is appropriate for all supported types */
     prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
                                                    PointerGetDatum(root),
@@ -3431,7 +3676,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
                                     BTLessStrategyNumber);
         if (cmpopr == InvalidOid)
             elog(ERROR, "no < operator for opclass %u", opclass);
-       cmpargs = makeList2(var, greaterstrcon);
+       cmpargs = makeList2(vardata->var, greaterstrcon);
         /* Assume scalarltsel is appropriate for all supported types */
         topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
                                                     PointerGetDatum(root),
@@ -3446,7 +3691,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
         prefixsel = topsel + prefixsel - 1.0;
  
         /* Adjust for double-exclusion of NULLs */
-       prefixsel += nulltestsel(root, IS_NULL, (Node *) var, var->varno);
+       prefixsel += nulltestsel(root, IS_NULL, vardata->var, 0);
  
         /*
          * A zero or slightly negative prefixsel should be converted into
@@ -4034,56 +4279,69 @@ btcostestimate(PG_FUNCTION_ARGS)
     Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
     Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
     double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
+   Oid         relid;
+   AttrNumber  colnum;
+   HeapTuple   tuple;
  
     genericcostestimate(root, rel, index, indexQuals,
                         indexStartupCost, indexTotalCost,
                         indexSelectivity, indexCorrelation);
  
     /*
-    * If the first column is a simple variable, and we can get an
-    * estimate for its ordering correlation C from pg_statistic, estimate
-    * the index correlation as C / number-of-columns. (The idea here is
+    * If we can get an estimate of the first column's ordering correlation C
+    * from pg_statistic, estimate the index correlation as C for a single-
+    * column index, or C * 0.75 for multiple columns.  (The idea here is
      * that multiple columns dilute the importance of the first column's
-    * ordering, but don't negate it entirely.)
+    * ordering, but don't negate it entirely.  Before 7.5 we divided the
+    * correlation by the number of columns, but that seems too strong.)
      */
     if (index->indexkeys[0] != 0)
     {
-       Oid         relid;
-       HeapTuple   tuple;
-
+       /* Simple variable --- look to stats for the underlying table */
         relid = getrelid(rel->relid, root->rtable);
         Assert(relid != InvalidOid);
-       tuple = SearchSysCache(STATRELATT,
-                              ObjectIdGetDatum(relid),
-                              Int16GetDatum(index->indexkeys[0]),
-                              0, 0);
-       if (HeapTupleIsValid(tuple))
+       colnum = index->indexkeys[0];
+   }
+   else
+   {
+       /* Expression --- maybe there are stats for the index itself */
+       relid = index->indexoid;
+       colnum = 1;
+   }
+
+   tuple = SearchSysCache(STATRELATT,
+                          ObjectIdGetDatum(relid),
+                          Int16GetDatum(colnum),
+                          0, 0);
+
+   if (HeapTupleIsValid(tuple))
+   {
+       Oid         typid;
+       int32       typmod;
+       float4     *numbers;
+       int         nnumbers;
+
+       /* XXX this code would break with different storage type */
+       get_atttypetypmod(relid, colnum, &typid, &typmod);
+
+       if (get_attstatsslot(tuple, typid, typmod,
+                            STATISTIC_KIND_CORRELATION,
+                            index->ordering[0],
+                            NULL, NULL, &numbers, &nnumbers))
         {
-           Oid         typid;
-           int32       typmod;
-           float4     *numbers;
-           int         nnumbers;
-
-           get_atttypetypmod(relid, index->indexkeys[0],
-                             &typid, &typmod);
-           if (get_attstatsslot(tuple, typid, typmod,
-                                STATISTIC_KIND_CORRELATION,
-                                index->ordering[0],
-                                NULL, NULL, &numbers, &nnumbers))
-           {
-               double      varCorrelation;
-               int         nKeys;
+           double      varCorrelation;
  
-               Assert(nnumbers == 1);
-               varCorrelation = numbers[0];
-               nKeys = index->ncolumns;
+           Assert(nnumbers == 1);
+           varCorrelation = numbers[0];
  
-               *indexCorrelation = varCorrelation / nKeys;
+           if (index->ncolumns > 1)
+               *indexCorrelation = varCorrelation * 0.75;
+           else
+               *indexCorrelation = varCorrelation;
  
-               free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
-           }
-           ReleaseSysCache(tuple);
+           free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
         }
+       ReleaseSysCache(tuple);
     }
  
     PG_RETURN_VOID();
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h

index 379e2ba7a5e614e3fb38352e7523d4f99a9a71db..3186b8d1c1f6e7aca0f43d265279cb4b1eb65e43 100644 (file)
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.53 2003/11/29 22:41:07 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.54 2004/02/17 00:52:53 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -77,6 +77,7 @@ extern HashPath *create_hashjoin_path(Query *root,
  extern void build_base_rel(Query *root, int relid);
  extern RelOptInfo *build_other_rel(Query *root, int relid);
  extern RelOptInfo *find_base_rel(Query *root, int relid);
+extern RelOptInfo *find_join_rel(Query *root, Relids relids);
  extern RelOptInfo *build_join_rel(Query *root,
                Relids joinrelids,
                RelOptInfo *outer_rel,
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index 873af8b9876bc15b642ebd96a5b231e0a1f9a15d..797e0a4c700cc735f5a9d39c3d9df47b56aea5a0 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.16 2003/11/29 22:41:16 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.17 2004/02/17 00:52:53 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -77,6 +77,9 @@ extern void mergejoinscansel(Query *root, Node *clause,
  extern double estimate_num_groups(Query *root, List *groupExprs,
                     double input_rows);
  
+extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey,
+                                           int nbuckets);
+
  extern Datum btcostestimate(PG_FUNCTION_ARGS);
  extern Datum rtcostestimate(PG_FUNCTION_ARGS);
  extern Datum hashcostestimate(PG_FUNCTION_ARGS);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
src/backend/optimizer/path/costsize.c		patch \| blob \| blame \| history
src/backend/optimizer/util/relnode.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/optimizer/pathnode.h		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history