Implement choice between hash-based and sort-based grouping for doing
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 22 Jan 2003 00:07:00 +0000 (00:07 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 22 Jan 2003 00:07:00 +0000 (00:07 +0000)
DISTINCT processing on the output of an IN sub-select.

src/backend/optimizer/plan/createplan.c
src/backend/optimizer/util/pathnode.c

index b7b1204e76e30f2ceb77244e39e54e2b48b9edd6..eb7e922d9a1b47bd4ca7058f31b79533ebea8234 100644 (file)
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.132 2003/01/20 18:54:52 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.133 2003/01/22 00:07:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
+#include <limits.h>
 
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@@ -418,6 +419,7 @@ create_unique_plan(Query *root, UniquePath *best_path)
        Plan       *plan;
        Plan       *subplan;
        List       *sub_targetlist;
+       List       *my_tlist;
        List       *l;
 
        subplan = create_plan(root, best_path->subpath);
@@ -474,21 +476,39 @@ create_unique_plan(Query *root, UniquePath *best_path)
                        subplan->targetlist = newtlist;
        }
 
+       my_tlist = new_unsorted_tlist(subplan->targetlist);
+
        if (best_path->use_hash)
        {
-               elog(ERROR, "create_unique_plan: hash case not implemented yet");
-               plan = NULL;
+               int             numGroupCols = length(my_tlist);
+               long    numGroups;
+               AttrNumber *groupColIdx;
+               int             i;
+
+               numGroups = (long) Min(best_path->rows, (double) LONG_MAX);
+
+               groupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber));
+               for (i = 0; i < numGroupCols; i++)
+                       groupColIdx[i] = i+1;
+
+               plan = (Plan *) make_agg(root,
+                                                                my_tlist,
+                                                                NIL,
+                                                                AGG_HASHED,
+                                                                numGroupCols,
+                                                                groupColIdx,
+                                                                numGroups,
+                                                                0,
+                                                                subplan);
        }
        else
        {
-               List       *sort_tlist;
                List       *sortList;
 
-               sort_tlist = new_unsorted_tlist(subplan->targetlist);
-               sortList = addAllTargetsToSortList(NIL, sort_tlist);
-               plan = (Plan *) make_sort_from_sortclauses(root, sort_tlist,
+               sortList = addAllTargetsToSortList(NIL, my_tlist);
+               plan = (Plan *) make_sort_from_sortclauses(root, my_tlist,
                                                                                                   subplan, sortList);
-               plan = (Plan *) make_unique(sort_tlist, plan, sortList);
+               plan = (Plan *) make_unique(my_tlist, plan, sortList);
        }
 
        plan->plan_rows = best_path->rows;
index a5cc94e831b0dacd93caaff9c569206c37382121..3e8d37cb28968212af3920969bcc7d72747722a9 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.84 2003/01/20 18:54:56 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.85 2003/01/22 00:07:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include <math.h>
 
+#include "catalog/pg_operator.h"
 #include "executor/executor.h"
+#include "miscadmin.h"
 #include "nodes/plannodes.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/restrictinfo.h"
+#include "parser/parse_expr.h"
+#include "parser/parse_oper.h"
 #include "utils/memutils.h"
 #include "utils/selfuncs.h"
+#include "utils/syscache.h"
+
+
+static bool hash_safe_tlist(List *tlist);
 
 
 /*****************************************************************************
@@ -506,6 +514,7 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 {
        UniquePath *pathnode;
        Path            sort_path;              /* dummy for result of cost_sort */
+       Path            agg_path;               /* dummy for result of cost_agg */
        MemoryContext oldcontext;
        List       *sub_targetlist;
        List       *l;
@@ -587,16 +596,80 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
         */
        sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
 
-       pathnode->use_hash = false;     /* for now */
+       /*
+        * Is it safe to use a hashed implementation?  If so, estimate and
+        * compare costs.  We only try this if we know the targetlist for
+        * sure (else we can't be sure about the datatypes involved).
+        */
+       pathnode->use_hash = false;
+       if (enable_hashagg && sub_targetlist && hash_safe_tlist(sub_targetlist))
+       {
+               /*
+                * Estimate the overhead per hashtable entry at 64 bytes (same
+                * as in planner.c).
+                */
+               int             hashentrysize = rel->width + 64;
 
-       pathnode->path.startup_cost = sort_path.startup_cost;
-       pathnode->path.total_cost = sort_path.total_cost;
+               if (hashentrysize * pathnode->rows <= SortMem * 1024L)
+               {
+                       cost_agg(&agg_path, root,
+                                        AGG_HASHED, 0,
+                                        numCols, pathnode->rows,
+                                        subpath->startup_cost,
+                                        subpath->total_cost,
+                                        rel->rows);
+                       if (agg_path.total_cost < sort_path.total_cost)
+                               pathnode->use_hash = true;
+               }
+       }
+
+       if (pathnode->use_hash)
+       {
+               pathnode->path.startup_cost = agg_path.startup_cost;
+               pathnode->path.total_cost = agg_path.total_cost;
+       }
+       else
+       {
+               pathnode->path.startup_cost = sort_path.startup_cost;
+               pathnode->path.total_cost = sort_path.total_cost;
+       }
 
        rel->cheapest_unique_path = (Path *) pathnode;
 
        return pathnode;
 }
 
+/*
+ * hash_safe_tlist - can datatypes of given tlist be hashed?
+ *
+ * We assume hashed aggregation will work if the datatype's equality operator
+ * is marked hashjoinable.
+ *
+ * Each member of tlist is treated as an expression node.  For every member
+ * we look up the equality operator for the expression's result type and
+ * inspect that operator's oprcanhash flag.
+ *
+ * Returns false as soon as one member either has no equality operator
+ * (the lookup returned NULL) or has an equality operator whose oprcanhash
+ * flag is not set.  Returns true if every member passes; note that this
+ * includes an empty list, for which the loop body never runs.
+ *
+ * NOTE(review): the "true" passed to equality_oper is presumably a
+ * no-error flag that makes it return NULL instead of raising an error
+ * when no operator exists --- confirm against parser/parse_oper.h.
+ *
+ * XXX this probably should be somewhere else.  See also hash_safe_grouping
+ * in plan/planner.c.
+ */
+static bool
+hash_safe_tlist(List *tlist)
+{
+       List       *tl;
+
+       foreach(tl, tlist)
+       {
+               Node       *expr = (Node *) lfirst(tl);
+               Operator        optup;
+               bool            oprcanhash;
+
+               optup = equality_oper(exprType(expr), true);
+               if (!optup)
+                       return false;   /* no equality operator was found */
+               oprcanhash = ((Form_pg_operator) GETSTRUCT(optup))->oprcanhash;
+               ReleaseSysCache(optup); /* flag was copied out above, so release now */
+               if (!oprcanhash)
+                       return false;   /* equality operator is not marked hashable */
+       }
+       return true;
+}
+
 /*
  * create_subqueryscan_path
  *       Creates a path corresponding to a sequential scan of a subquery,