Allow left join removals and unique joins on partitioned tables
authorDavid Rowley <drowley@postgresql.org>
Mon, 9 Jan 2023 04:15:08 +0000 (17:15 +1300)
committerDavid Rowley <drowley@postgresql.org>
Mon, 9 Jan 2023 04:15:08 +0000 (17:15 +1300)
This allows left join removals and unique joins to work with partitioned
tables.  The planner just lacked sufficient proofs that a given join
would not cause any row duplication.  Unique indexes currently serve as
that proof, so have get_relation_info() populate the indexlist for
partitioned tables too.

Author: Arne Roland
Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley
Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de

src/backend/optimizer/util/plancat.c
src/backend/utils/adt/selfuncs.c
src/include/nodes/pathnodes.h
src/test/regress/expected/join.out
src/test/regress/expected/partition_join.out
src/test/regress/sql/join.sql
src/test/regress/sql/partition_join.sql

index 9f158f2421b0f7be3b6da784b098298d557bc410..d58c4a10782740b8d6dcfe7377fe79bf341ce4f9 100644 (file)
@@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation,
  * If inhparent is true, all we need to do is set up the attr arrays:
  * the RelOptInfo actually represents the appendrel formed by an inheritance
  * tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * important for it, however, for partitioned tables, we do populate the
+ * indexlist as the planner uses unique indexes as unique proofs for certain
+ * optimizations.
  */
 void
 get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
        /*
         * Make list of indexes.  Ignore indexes on system catalogs if told to.
-        * Don't bother with indexes for an inheritance parent, either.
+        * Don't bother with indexes from traditional inheritance parents.  For
+        * partitioned tables, we need a list of at least unique indexes as these
+        * serve as unique proofs for certain planner optimizations.  However,
+        * let's not discriminate here and just record all partitioned indexes
+        * whether they're unique indexes or not.
         */
-       if (inhparent ||
-               (IgnoreSystemIndexes && IsSystemRelation(relation)))
+       if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+               || (IgnoreSystemIndexes && IsSystemRelation(relation)))
                hasindex = false;
        else
                hasindex = relation->rd_rel->relhasindex;
@@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                                continue;
                        }
 
-                       /*
-                        * Ignore partitioned indexes, since they are not usable for
-                        * queries.
-                        */
-                       if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-                       {
-                               index_close(indexRelation, NoLock);
-                               continue;
-                       }
-
                        /*
                         * If the index is valid, but cannot yet be used, ignore it; but
                         * mark the plan we are generating as transient. See
@@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
                        info->relam = indexRelation->rd_rel->relam;
 
-                       /* We copy just the fields we need, not all of rd_indam */
-                       amroutine = indexRelation->rd_indam;
-                       info->amcanorderbyop = amroutine->amcanorderbyop;
-                       info->amoptionalkey = amroutine->amoptionalkey;
-                       info->amsearcharray = amroutine->amsearcharray;
-                       info->amsearchnulls = amroutine->amsearchnulls;
-                       info->amcanparallel = amroutine->amcanparallel;
-                       info->amhasgettuple = (amroutine->amgettuple != NULL);
-                       info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-                               relation->rd_tableam->scan_bitmap_next_block != NULL;
-                       info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-                                                                 amroutine->amrestrpos != NULL);
-                       info->amcostestimate = amroutine->amcostestimate;
-                       Assert(info->amcostestimate != NULL);
-
-                       /* Fetch index opclass options */
-                       info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
                        /*
-                        * Fetch the ordering information for the index, if any.
+                        * We don't have an AM for partitioned indexes, so we'll just
+                        * NULLify the AM related fields for those.
                         */
-                       if (info->relam == BTREE_AM_OID)
+                       if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
                        {
+                               /* We copy just the fields we need, not all of rd_indam */
+                               amroutine = indexRelation->rd_indam;
+                               info->amcanorderbyop = amroutine->amcanorderbyop;
+                               info->amoptionalkey = amroutine->amoptionalkey;
+                               info->amsearcharray = amroutine->amsearcharray;
+                               info->amsearchnulls = amroutine->amsearchnulls;
+                               info->amcanparallel = amroutine->amcanparallel;
+                               info->amhasgettuple = (amroutine->amgettuple != NULL);
+                               info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+                                       relation->rd_tableam->scan_bitmap_next_block != NULL;
+                               info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+                                                                         amroutine->amrestrpos != NULL);
+                               info->amcostestimate = amroutine->amcostestimate;
+                               Assert(info->amcostestimate != NULL);
+
+                               /* Fetch index opclass options */
+                               info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
                                /*
-                                * If it's a btree index, we can use its opfamily OIDs
-                                * directly as the sort ordering opfamily OIDs.
+                                * Fetch the ordering information for the index, if any.
                                 */
-                               Assert(amroutine->amcanorder);
-
-                               info->sortopfamily = info->opfamily;
-                               info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-                               info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-                               for (i = 0; i < nkeycolumns; i++)
+                               if (info->relam == BTREE_AM_OID)
                                {
-                                       int16           opt = indexRelation->rd_indoption[i];
+                                       /*
+                                        * If it's a btree index, we can use its opfamily OIDs
+                                        * directly as the sort ordering opfamily OIDs.
+                                        */
+                                       Assert(amroutine->amcanorder);
 
-                                       info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-                                       info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-                               }
-                       }
-                       else if (amroutine->amcanorder)
-                       {
-                               /*
-                                * Otherwise, identify the corresponding btree opfamilies by
-                                * trying to map this index's "<" operators into btree.  Since
-                                * "<" uniquely defines the behavior of a sort order, this is
-                                * a sufficient test.
-                                *
-                                * XXX This method is rather slow and also requires the
-                                * undesirable assumption that the other index AM numbers its
-                                * strategies the same as btree.  It'd be better to have a way
-                                * to explicitly declare the corresponding btree opfamily for
-                                * each opfamily of the other index type.  But given the lack
-                                * of current or foreseeable amcanorder index types, it's not
-                                * worth expending more effort on now.
-                                */
-                               info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-                               info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-                               info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                                       info->sortopfamily = info->opfamily;
+                                       info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                                       info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
 
-                               for (i = 0; i < nkeycolumns; i++)
-                               {
-                                       int16           opt = indexRelation->rd_indoption[i];
-                                       Oid                     ltopr;
-                                       Oid                     btopfamily;
-                                       Oid                     btopcintype;
-                                       int16           btstrategy;
-
-                                       info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-                                       info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-                                       ltopr = get_opfamily_member(info->opfamily[i],
-                                                                                               info->opcintype[i],
-                                                                                               info->opcintype[i],
-                                                                                               BTLessStrategyNumber);
-                                       if (OidIsValid(ltopr) &&
-                                               get_ordering_op_properties(ltopr,
-                                                                                                  &btopfamily,
-                                                                                                  &btopcintype,
-                                                                                                  &btstrategy) &&
-                                               btopcintype == info->opcintype[i] &&
-                                               btstrategy == BTLessStrategyNumber)
+                                       for (i = 0; i < nkeycolumns; i++)
                                        {
-                                               /* Successful mapping */
-                                               info->sortopfamily[i] = btopfamily;
+                                               int16           opt = indexRelation->rd_indoption[i];
+
+                                               info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+                                               info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
                                        }
-                                       else
+                               }
+                               else if (amroutine->amcanorder)
+                               {
+                                       /*
+                                        * Otherwise, identify the corresponding btree opfamilies
+                                        * by trying to map this index's "<" operators into btree.
+                                        * Since "<" uniquely defines the behavior of a sort
+                                        * order, this is a sufficient test.
+                                        *
+                                        * XXX This method is rather slow and also requires the
+                                        * undesirable assumption that the other index AM numbers
+                                        * its strategies the same as btree.  It'd be better to
+                                        * have a way to explicitly declare the corresponding
+                                        * btree opfamily for each opfamily of the other index
+                                        * type.  But given the lack of current or foreseeable
+                                        * amcanorder index types, it's not worth expending more
+                                        * effort on now.
+                                        */
+                                       info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
+                                       info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                                       info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+
+                                       for (i = 0; i < nkeycolumns; i++)
                                        {
-                                               /* Fail ... quietly treat index as unordered */
-                                               info->sortopfamily = NULL;
-                                               info->reverse_sort = NULL;
-                                               info->nulls_first = NULL;
-                                               break;
+                                               int16           opt = indexRelation->rd_indoption[i];
+                                               Oid                     ltopr;
+                                               Oid                     btopfamily;
+                                               Oid                     btopcintype;
+                                               int16           btstrategy;
+
+                                               info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+                                               info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+                                               ltopr = get_opfamily_member(info->opfamily[i],
+                                                                                                       info->opcintype[i],
+                                                                                                       info->opcintype[i],
+                                                                                                       BTLessStrategyNumber);
+                                               if (OidIsValid(ltopr) &&
+                                                       get_ordering_op_properties(ltopr,
+                                                                                                          &btopfamily,
+                                                                                                          &btopcintype,
+                                                                                                          &btstrategy) &&
+                                                       btopcintype == info->opcintype[i] &&
+                                                       btstrategy == BTLessStrategyNumber)
+                                               {
+                                                       /* Successful mapping */
+                                                       info->sortopfamily[i] = btopfamily;
+                                               }
+                                               else
+                                               {
+                                                       /* Fail ... quietly treat index as unordered */
+                                                       info->sortopfamily = NULL;
+                                                       info->reverse_sort = NULL;
+                                                       info->nulls_first = NULL;
+                                                       break;
+                                               }
                                        }
                                }
+                               else
+                               {
+                                       info->sortopfamily = NULL;
+                                       info->reverse_sort = NULL;
+                                       info->nulls_first = NULL;
+                               }
                        }
                        else
                        {
+                               info->amcanorderbyop = false;
+                               info->amoptionalkey = false;
+                               info->amsearcharray = false;
+                               info->amsearchnulls = false;
+                               info->amcanparallel = false;
+                               info->amhasgettuple = false;
+                               info->amhasgetbitmap = false;
+                               info->amcanmarkpos = false;
+                               info->amcostestimate = NULL;
+
                                info->sortopfamily = NULL;
                                info->reverse_sort = NULL;
                                info->nulls_first = NULL;
@@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                         * the number-of-tuples estimate to equal the parent table; if it
                         * is partial then we have to use the same methods as we would for
                         * a table, except we can be sure that the index is not larger
-                        * than the table.
+                        * than the table.  We must ignore partitioned indexes here as as
+                        * there are not physical indexes.
                         */
-                       if (info->indpred == NIL)
+                       if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
                        {
-                               info->pages = RelationGetNumberOfBlocks(indexRelation);
-                               info->tuples = rel->tuples;
-                       }
-                       else
-                       {
-                               double          allvisfrac; /* dummy */
-
-                               estimate_rel_size(indexRelation, NULL,
-                                                                 &info->pages, &info->tuples, &allvisfrac);
-                               if (info->tuples > rel->tuples)
+                               if (info->indpred == NIL)
+                               {
+                                       info->pages = RelationGetNumberOfBlocks(indexRelation);
                                        info->tuples = rel->tuples;
-                       }
+                               }
+                               else
+                               {
+                                       double          allvisfrac; /* dummy */
 
-                       if (info->relam == BTREE_AM_OID)
-                       {
-                               /* For btrees, get tree height while we have the index open */
-                               info->tree_height = _bt_getrootheight(indexRelation);
+                                       estimate_rel_size(indexRelation, NULL,
+                                                                         &info->pages, &info->tuples, &allvisfrac);
+                                       if (info->tuples > rel->tuples)
+                                               info->tuples = rel->tuples;
+                               }
+
+                               if (info->relam == BTREE_AM_OID)
+                               {
+                                       /*
+                                        * For btrees, get tree height while we have the index
+                                        * open
+                                        */
+                                       info->tree_height = _bt_getrootheight(indexRelation);
+                               }
+                               else
+                               {
+                                       /* For other index types, just set it to "unknown" for now */
+                                       info->tree_height = -1;
+                               }
                        }
                        else
                        {
-                               /* For other index types, just set it to "unknown" for now */
+                               /* Zero these out for partitioned indexes */
+                               info->pages = 0;
+                               info->tuples = 0.0;
                                info->tree_height = -1;
                        }
 
index f50e58adbd63b264973cd80772f74cdd4ca10513..57de51f0db20b439f4880e523e654ae54c830921 100644 (file)
@@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
        rte = root->simple_rte_array[rel->relid];
        Assert(rte->rtekind == RTE_RELATION);
 
+       /* ignore partitioned tables.  Any indexes here are not real indexes */
+       if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+               return false;
+
        /* Search through the indexes to see if any match our problem */
        foreach(lc, rel->indexlist)
        {
index 1827e5064799082c887f66574b0f10d7a9ced3e0..c20b7298a3d879de8a3f0a7ec487495f39ecd348 100644 (file)
@@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *             lateral_referencers - relids of rels that reference this one laterally
  *                             (includes both direct and indirect lateral references)
  *             indexlist - list of IndexOptInfo nodes for relation's indexes
- *                                     (always NIL if it's not a table)
+ *                                     (always NIL if it's not a table or partitioned table)
  *             pages - number of disk pages in relation (zero if not a table)
  *             tuples - number of tuples in relation (not considering restrictions)
  *             allvisfrac - fraction of disk pages that are marked all-visible
@@ -1097,11 +1097,11 @@ struct IndexOptInfo
        Oid                *opfamily pg_node_attr(array_size(nkeycolumns));
        /* OIDs of opclass declared input data types */
        Oid                *opcintype pg_node_attr(array_size(nkeycolumns));
-       /* OIDs of btree opfamilies, if orderable */
+       /* OIDs of btree opfamilies, if orderable.  NULL if partitioned index */
        Oid                *sortopfamily pg_node_attr(array_size(nkeycolumns));
-       /* is sort order descending? */
+       /* is sort order descending? or NULL if partitioned index */
        bool       *reverse_sort pg_node_attr(array_size(nkeycolumns));
-       /* do NULLs come first in the sort order? */
+       /* do NULLs come first in the sort order? or NULL if partitioned index */
        bool       *nulls_first pg_node_attr(array_size(nkeycolumns));
        /* opclass-specific options for columns */
        bytea     **opclassoptions pg_node_attr(read_write_ignore);
@@ -1139,7 +1139,7 @@ struct IndexOptInfo
 
        /*
         * Remaining fields are copied from the index AM's API struct
-        * (IndexAmRoutine).
+        * (IndexAmRoutine).  These fields are not set for partitioned indexes.
         */
        bool            amcanorderbyop;
        bool            amoptionalkey;
index 3ddea3b683770432d4b27f99f4760d44c82b1754..c2b85d27950519c4a2c187b5133c627dae0dfdc1 100644 (file)
@@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
index c59caf1cb3d962daea4ee6a66853ca863e5bfb24..c649c4aeaae45cc16e654b07cb747ad05327eb2c 100644 (file)
@@ -4874,7 +4874,7 @@ ANALYZE fract_t;
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
index 9fc6ef43768eda46d0bde5334033bc86af219376..027927354c09f69361e04fb2922d960df8536867 100644 (file)
@@ -1709,6 +1709,13 @@ explain (costs off)
 select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
                          lateral generate_series(1, q.id) gs(i) where q.id = gs.i;
 
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+
 rollback;
 
 create temp table parent (k int primary key, pd int);
index 67f506361f8661600c80b4dad65f1fb198a99d0a..9e16f1ca550fae192065d64612b8800577b2b30e 100644 (file)
@@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;