Fix overenthusiastic optimization of 'x IN (SELECT DISTINCT ...)' and related

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c

index 4c52561521794e08b51fdbd073641e44e0cdb5ea..4f3e8d6d9db8f359f9fbb5db82cb8a0ac7d6a2c7 100644 (file)
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.122 2005/06/05 22:32:56 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.123 2005/07/15 17:09:25 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -34,7 +34,8 @@
  #include "utils/syscache.h"
  
  
-static bool is_distinct_query(Query *query);
+static List *translate_sub_tlist(List *tlist, int relid);
+static bool query_is_distinct_for(Query *query, List *colnos);
  static bool hash_safe_tlist(List *tlist);
  
  
@@ -800,14 +801,41 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
         pathnode->subpath = subpath;
  
         /*
-        * If the input is a subquery whose output must be unique already, we
-        * don't need to do anything.
+        * Try to identify the targetlist that will actually be unique-ified.
+        * In current usage, this routine is only used for sub-selects of IN
+        * clauses, so we should be able to find the tlist in in_info_list.
+        */
+       sub_targetlist = NIL;
+       foreach(l, root->in_info_list)
+       {
+               InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
+
+               if (bms_equal(ininfo->righthand, rel->relids))
+               {
+                       sub_targetlist = ininfo->sub_targetlist;
+                       break;
+               }
+       }
+
+       /*
+        * If the input is a subquery whose output must be unique already,
+        * then we don't need to do anything.  The test for uniqueness has
+        * to consider exactly which columns we are extracting; for example
+        * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
+        * So we cannot check for this optimization unless we found our own
+        * targetlist above, and it consists only of simple Vars referencing
+        * subquery outputs.  (Possibly we could do something with expressions
+        * in the subquery outputs, too, but for now keep it simple.)
          */
-       if (rel->rtekind == RTE_SUBQUERY)
+       if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
         {
                 RangeTblEntry *rte = rt_fetch(rel->relid, root->parse->rtable);
+               List   *sub_tlist_colnos;
+
+               sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
  
-               if (is_distinct_query(rte->subquery))
+               if (sub_tlist_colnos &&
+                       query_is_distinct_for(rte->subquery, sub_tlist_colnos))
                 {
                         pathnode->umethod = UNIQUE_PATH_NOOP;
                         pathnode->rows = rel->rows;
@@ -821,23 +849,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
                 }
         }
  
-       /*
-        * Try to identify the targetlist that will actually be unique-ified.
-        * In current usage, this routine is only used for sub-selects of IN
-        * clauses, so we should be able to find the tlist in in_info_list.
-        */
-       sub_targetlist = NIL;
-       foreach(l, root->in_info_list)
-       {
-               InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
-               if (bms_equal(ininfo->righthand, rel->relids))
-               {
-                       sub_targetlist = ininfo->sub_targetlist;
-                       break;
-               }
-       }
-
         /*
          * If we know the targetlist, try to estimate number of result rows;
          * otherwise punt.
@@ -913,46 +924,83 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
  }
  
  /*
- * is_distinct_query - does query never return duplicate rows?
+ * translate_sub_tlist - get subquery column numbers represented by tlist
+ *
+ * The given targetlist should contain only Vars referencing the given relid.
+ * Extract their varattnos (ie, the column numbers of the subquery) and return
+ * as an integer List.
+ *
+ * If any of the tlist items is not a simple Var, we cannot determine whether
+ * the subquery's uniqueness condition (if any) matches ours, so punt and
+ * return NIL.
   */
-static bool
-is_distinct_query(Query *query)
+static List *
+translate_sub_tlist(List *tlist, int relid)
  {
-       /* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
-       if (has_distinct_clause(query))
-               return true;
+       List       *result = NIL;
+       ListCell   *l;
  
-       /* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
-       if (query->setOperations)
+       foreach(l, tlist)
         {
-               SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+               Var        *var = (Var *) lfirst(l);
  
-               Assert(IsA(topop, SetOperationStmt));
-               Assert(topop->op != SETOP_NONE);
+               if (!var || !IsA(var, Var) ||
+                       var->varno != relid)
+                       return NIL;                     /* punt */
  
-               if (!topop->all)
+               result = lappend_int(result, var->varattno);
+       }
+       return result;
+}
+
+/*
+ * query_is_distinct_for - does query never return duplicates of the
+ *             specified columns?
+ *
+ * colnos is an integer list of output column numbers (resno's).  We are
+ * interested in whether rows consisting of just these columns are certain
+ * to be distinct.
+ */
+static bool
+query_is_distinct_for(Query *query, List *colnos)
+{
+       ListCell   *l;
+
+       /*
+        * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
+        * columns in the DISTINCT clause appear in colnos.
+        */
+       if (query->distinctClause)
+       {
+               foreach(l, query->distinctClause)
+               {
+                       SortClause *scl = (SortClause *) lfirst(l);
+                       TargetEntry *tle = get_sortgroupclause_tle(scl,
+                                                                                                          query->targetList);
+
+                       if (!list_member_int(colnos, tle->resno))
+                               break;                  /* exit early if no match */
+               }
+               if (l == NULL)                  /* had matches for all? */
                         return true;
         }
  
         /*
-        * GROUP BY guarantees uniqueness if all the grouped columns appear in
-        * the output.  In our implementation this means checking they are non
-        * resjunk columns.
+        * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
+        * appear in colnos.
          */
         if (query->groupClause)
         {
-               ListCell   *gl;
-
-               foreach(gl, query->groupClause)
+               foreach(l, query->groupClause)
                 {
-                       GroupClause *grpcl = (GroupClause *) lfirst(gl);
+                       GroupClause *grpcl = (GroupClause *) lfirst(l);
                         TargetEntry *tle = get_sortgroupclause_tle(grpcl,
                                                                                                            query->targetList);
  
-                       if (tle->resjunk)
-                               break;
+                       if (!list_member_int(colnos, tle->resno))
+                               break;                  /* exit early if no match */
                 }
-               if (!gl)                                /* got to the end? */
+               if (l == NULL)                  /* had matches for all? */
                         return true;
         }
         else
@@ -965,6 +1013,33 @@ is_distinct_query(Query *query)
                         return true;
         }
  
+       /*
+        * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+        * except with ALL
+        */
+       if (query->setOperations)
+       {
+               SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+               Assert(IsA(topop, SetOperationStmt));
+               Assert(topop->op != SETOP_NONE);
+
+               if (!topop->all)
+               {
+                       /* We're good if all the nonjunk output columns are in colnos */
+                       foreach(l, query->targetList)
+                       {
+                               TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+                               if (!tle->resjunk &&
+                                       !list_member_int(colnos, tle->resno))
+                                       break;          /* exit early if no match */
+                       }
+                       if (l == NULL)          /* had matches for all? */
+                               return true;
+               }
+       }
+
         /*
          * XXX Are there any other cases in which we can easily see the result
          * must be distinct?
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out

index 07e727de482594b5d8d2e102834c29a126a3f630..d99656eac774a94beea91a16cb97f1cf362feb6b 100644 (file)
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -202,6 +202,63 @@ select count(distinct ss.ten) from
      10
  (1 row)
  
+--
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+INSERT INTO foo VALUES (1);
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
  --
  -- Test case to catch problems with multiply nested sub-SELECTs not getting
  -- recalculated properly.  Per bug report from Didier Moens.
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql

index 5cba9ca74d00c75afc1571da406f26f8263090e9..a07cc337596c97c01605d0ec5396ba9b208b4eb5 100644 (file)
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -95,6 +95,39 @@ select count(distinct ss.ten) from
    (select ten from tenk1 a
     where unique1 IN (select distinct hundred from tenk1 b)) ss;
  
+--
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+
+INSERT INTO foo VALUES (1);
+
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+
  --
  -- Test case to catch problems with multiply nested sub-SELECTs not getting
  -- recalculated properly.  Per bug report from Didier Moens.
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 15 Jul 2005 17:09:26 +0000 (17:09 +0000)
src/backend/optimizer/util/pathnode.c		patch \| blob \| blame \| history
src/test/regress/expected/subselect.out		patch \| blob \| blame \| history
src/test/regress/sql/subselect.sql		patch \| blob \| blame \| history