Improve estimate of distinct values in estimate_num_groups().

author Dean Rasheed <dean.a.rasheed@gmail.com>

Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)

committer Dean Rasheed <dean.a.rasheed@gmail.com>

Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)
author Dean Rasheed <dean.a.rasheed@gmail.com>
Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)
committer Dean Rasheed <dean.a.rasheed@gmail.com>
Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index a6555e91b77bf62a5f166d1a863f3e0c242e7b8d..cc2a9a1b6c50515dc7b9cd2538c2506ffe5b3b27 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3237,15 +3237,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   *             restriction selectivity of the equality in the next step.
   *     4.  For Vars within a single source rel, we multiply together the numbers
   *             of values, clamp to the number of rows in the rel (divided by 10 if
- *             more than one Var), and then multiply by the selectivity of the
- *             restriction clauses for that rel.  When there's more than one Var,
- *             the initial product is probably too high (it's the worst case) but
- *             clamping to a fraction of the rel's rows seems to be a helpful
- *             heuristic for not letting the estimate get out of hand.  (The factor
- *             of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
- *             by the restriction selectivity is effectively assuming that the
- *             restriction clauses are independent of the grouping, which is a crummy
- *             assumption, but it's hard to do better.
+ *             more than one Var), and then multiply by a factor based on the
+ *             selectivity of the restriction clauses for that rel.  When there's
+ *             more than one Var, the initial product is probably too high (it's the
+ *             worst case) but clamping to a fraction of the rel's rows seems to be a
+ *             helpful heuristic for not letting the estimate get out of hand.  (The
+ *             factor of 10 is derived from pre-Postgres-7.4 practice.)  The factor
+ *             we multiply by to adjust for the restriction selectivity assumes that
+ *             the restriction clauses are independent of the grouping, which may not
+ *             be a valid assumption, but it's hard to do better.
   *     5.  If there are Vars from multiple rels, we repeat step 4 for each such
   *             rel, and multiply the results together.
   * Note that rels not containing grouped Vars are ignored completely, as are
@@ -3439,9 +3439,51 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
                                 reldistinct = clamp;
  
                         /*
-                        * Multiply by restriction selectivity.
+                        * Update the estimate based on the restriction selectivity,
+                        * guarding against division by zero when reldistinct is zero.
+                        * Also skip this if we know that we are returning all rows.
                          */
-                       reldistinct *= rel->rows / rel->tuples;
+                       if (reldistinct > 0 && rel->rows < rel->tuples)
+                       {
+                               /*
+                                * Given a table containing N rows with n distinct values in a
+                                * uniform distribution, if we select p rows at random then
+                                * the expected number of distinct values selected is
+                                *
+                                * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1))
+                                *
+                                * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!)
+                                *
+                                * See "Approximating block accesses in database
+                                * organizations", S. B. Yao, Communications of the ACM,
+                                * Volume 20 Issue 4, April 1977 Pages 260-261.
+                                *
+                                * Alternatively, re-arranging the terms from the factorials,
+                                * this may be written as
+                                *
+                                * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1))
+                                *
+                                * This form of the formula is more efficient to compute in
+                                * the common case where p is larger than N/n.  Additionally,
+                                * as pointed out by Dell'Era, if i << N for all terms in the
+                                * product, it can be approximated by
+                                *
+                                * n * (1 - ((N-p)/N)^(N/n))
+                                *
+                                * See "Expected distinct values when selecting from a bag
+                                * without replacement", Alberto Dell'Era,
+                                * http://www.adellera.it/investigations/distinct_balls/.
+                                *
+                                * The condition i << N is equivalent to n >> 1, so this is a
+                                * good approximation when the number of distinct values in
+                                * the table is large.  It turns out that this formula also
+                                * works well even when n is small.
+                                */
+                               reldistinct *=
+                                       (1 - pow((rel->tuples - rel->rows) / rel->tuples,
+                                                        rel->tuples / reldistinct));
+                       }
+                       reldistinct = clamp_row_est(reldistinct);
  
                         /*
                          * Update estimate of total distinct groups.
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out

index de64ca7ec72c0c2ae7961b1d36f46a8589fe4205..0fc93d9d7267e40ceef81da6a0d657649a8a6782 100644 (file)
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -807,27 +807,24 @@ select * from int4_tbl where
  explain (verbose, costs off)
  select * from int4_tbl o where (f1, f1) in
    (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
-                              QUERY PLAN                              
-----------------------------------------------------------------------
- Hash Join
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Hash Semi Join
     Output: o.f1
     Hash Cond: (o.f1 = "ANY_subquery".f1)
     ->  Seq Scan on public.int4_tbl o
           Output: o.f1
     ->  Hash
           Output: "ANY_subquery".f1, "ANY_subquery".g
-         ->  HashAggregate
+         ->  Subquery Scan on "ANY_subquery"
                 Output: "ANY_subquery".f1, "ANY_subquery".g
-               Group Key: "ANY_subquery".f1, "ANY_subquery".g
-               ->  Subquery Scan on "ANY_subquery"
-                     Output: "ANY_subquery".f1, "ANY_subquery".g
-                     Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
-                     ->  HashAggregate
-                           Output: i.f1, (generate_series(1, 2) / 10)
-                           Group Key: i.f1
-                           ->  Seq Scan on public.int4_tbl i
-                                 Output: i.f1
-(18 rows)
+               Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
+               ->  HashAggregate
+                     Output: i.f1, (generate_series(1, 2) / 10)
+                     Group Key: i.f1
+                     ->  Seq Scan on public.int4_tbl i
+                           Output: i.f1
+(15 rows)
  
  select * from int4_tbl o where (f1, f1) in
    (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
author	Dean Rasheed <dean.a.rasheed@gmail.com>
	Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)
committer	Dean Rasheed <dean.a.rasheed@gmail.com>
	Mon, 4 Apr 2016 11:41:56 +0000 (12:41 +0100)
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/test/regress/expected/subselect.out		patch \| blob \| blame \| history