Fix hash partition pruning with asymmetric partition sets.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c

index 1746cb87936cecaca7fff1f26f09db87518f1fa4..746cd1e9d7a1bfd3b9b3417cb65d7e7cc3e44662 100644 (file)
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1323,16 +1323,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
     {
         case PARTITION_STRATEGY_HASH:
             {
-               int         greatest_modulus;
                 uint64      rowHash;
  
-               greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
                 rowHash = compute_partition_hash_value(key->partnatts,
                                                        key->partsupfunc,
                                                        key->partcollation,
                                                        values, isnull);
  
-               part_index = boundinfo->indexes[rowHash % greatest_modulus];
+               part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
             }
             break;
  
diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c

index b9aeb77bc2bad5294fa78e3d20850b582ce05614..0c3f212ff21e6433c8cab81b27663d78dc9d8c07 100644 (file)
--- a/src/backend/partitioning/partbounds.c
+++ b/src/backend/partitioning/partbounds.c
@@ -224,7 +224,6 @@ static int  partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
                                     Oid *partcollation,
                                     PartitionBoundInfo boundinfo,
                                     PartitionRangeBound *probe, int32 *cmpval);
-static int get_partition_bound_num_indexes(PartitionBoundInfo b);
  static Expr *make_partition_op_expr(PartitionKey key, int keynum,
                                     uint16 strategy, Expr *arg1, Expr *arg2);
  static Oid get_partition_operator(PartitionKey key, int col,
@@ -398,6 +397,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
     boundinfo->ndatums = ndatums;
     boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+   boundinfo->nindexes = greatest_modulus;
     boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
     for (i = 0; i < greatest_modulus; i++)
         boundinfo->indexes[i] = -1;
@@ -530,6 +530,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
     boundinfo->ndatums = ndatums;
     boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+   boundinfo->nindexes = ndatums;
     boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
  
     /*
@@ -725,8 +726,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
  
     /*
      * For range partitioning, an additional value of -1 is stored as the last
-    * element.
+    * element of the indexes[] array.
      */
+   boundinfo->nindexes = ndatums + 1;
     boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
  
     for (i = 0; i < ndatums; i++)
@@ -807,45 +809,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
     if (b1->ndatums != b2->ndatums)
         return false;
  
+   if (b1->nindexes != b2->nindexes)
+       return false;
+
     if (b1->null_index != b2->null_index)
         return false;
  
     if (b1->default_index != b2->default_index)
         return false;
  
-   if (b1->strategy == PARTITION_STRATEGY_HASH)
+   /* For all partition strategies, the indexes[] arrays have to match */
+   for (i = 0; i < b1->nindexes; i++)
     {
-       int         greatest_modulus = get_hash_partition_greatest_modulus(b1);
-
-       /*
-        * If two hash partitioned tables have different greatest moduli,
-        * their partition schemes don't match.
-        */
-       if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
+       if (b1->indexes[i] != b2->indexes[i])
             return false;
+   }
  
+   /* Finally, compare the datums[] arrays */
+   if (b1->strategy == PARTITION_STRATEGY_HASH)
+   {
         /*
          * We arrange the partitions in the ascending order of their moduli
          * and remainders.  Also every modulus is factor of next larger
          * modulus.  Therefore we can safely store index of a given partition
          * in indexes array at remainder of that partition.  Also entries at
          * (remainder + N * modulus) positions in indexes array are all same
-        * for (modulus, remainder) specification for any partition.  Thus
-        * datums array from both the given bounds are same, if and only if
-        * their indexes array will be same.  So, it suffices to compare
-        * indexes array.
-        */
-       for (i = 0; i < greatest_modulus; i++)
-           if (b1->indexes[i] != b2->indexes[i])
-               return false;
-
-#ifdef USE_ASSERT_CHECKING
-
-       /*
-        * Nonetheless make sure that the bounds are indeed same when the
+        * for (modulus, remainder) specification for any partition.  Thus the
+        * datums arrays from the given bounds are the same, if and only if
+        * their indexes arrays are the same.  So, it suffices to compare the
+        * indexes arrays.
+        *
+        * Nonetheless make sure that the bounds are indeed the same when the
          * indexes match.  Hash partition bound stores modulus and remainder
          * at b1->datums[i][0] and b1->datums[i][1] position respectively.
          */
+#ifdef USE_ASSERT_CHECKING
         for (i = 0; i < b1->ndatums; i++)
             Assert((b1->datums[i][0] == b2->datums[i][0] &&
                     b1->datums[i][1] == b2->datums[i][1]));
@@ -891,15 +889,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
                                   parttypbyval[j], parttyplen[j]))
                     return false;
             }
-
-           if (b1->indexes[i] != b2->indexes[i])
-               return false;
         }
-
-       /* There are ndatums+1 indexes in case of range partitions */
-       if (b1->strategy == PARTITION_STRATEGY_RANGE &&
-           b1->indexes[i] != b2->indexes[i])
-           return false;
     }
     return true;
  }
@@ -920,8 +910,8 @@ partition_bounds_copy(PartitionBoundInfo src,
     PartitionBoundInfo dest;
     int         i;
     int         ndatums;
+   int         nindexes;
     int         partnatts;
-   int         num_indexes;
     bool        hash_part;
     int         natts;
  
@@ -929,10 +919,9 @@ partition_bounds_copy(PartitionBoundInfo src,
  
     dest->strategy = src->strategy;
     ndatums = dest->ndatums = src->ndatums;
+   nindexes = dest->nindexes = src->nindexes;
     partnatts = key->partnatts;
  
-   num_indexes = get_partition_bound_num_indexes(src);
-
     /* List partitioned tables have only a single partition key. */
     Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
  
@@ -990,8 +979,8 @@ partition_bounds_copy(PartitionBoundInfo src,
         }
     }
  
-   dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
-   memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
+   dest->indexes = (int *) palloc(sizeof(int) * nindexes);
+   memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
  
     dest->null_index = src->null_index;
     dest->default_index = src->default_index;
@@ -2456,6 +2445,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums,
     }
  
     Assert(list_length(merged_indexes) == ndatums);
+   merged_bounds->nindexes = ndatums;
     merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
     pos = 0;
     foreach(lc, merged_indexes)
@@ -2889,7 +2879,7 @@ check_new_partition_bound(char *relname, Relation parent,
                                 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                                  errmsg("every hash partition modulus must be a factor of the next larger modulus")));
  
-                   greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
+                   greatest_modulus = boundinfo->nindexes;
                     remainder = spec->remainder;
  
                     /*
@@ -3282,18 +3272,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
  /*
   * get_hash_partition_greatest_modulus
   *
- * Returns the greatest modulus of the hash partition bound. The greatest
- * modulus will be at the end of the datums array because hash partitions are
- * arranged in the ascending order of their moduli and remainders.
+ * Returns the greatest modulus of the hash partition bound.
+ * This is no longer used in the core code, but we keep it around
+ * in case external modules are using it.
   */
  int
  get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
  {
     Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
-   Assert(bound->datums && bound->ndatums > 0);
-   Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
-
-   return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
+   return bound->nindexes;
  }
  
  /*
@@ -3697,46 +3684,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
                                 b1, b2);
  }
  
-/*
- * get_partition_bound_num_indexes
- *
- * Returns the number of the entries in the partition bound indexes array.
- */
-static int
-get_partition_bound_num_indexes(PartitionBoundInfo bound)
-{
-   int         num_indexes;
-
-   Assert(bound);
-
-   switch (bound->strategy)
-   {
-       case PARTITION_STRATEGY_HASH:
-
-           /*
-            * The number of the entries in the indexes array is same as the
-            * greatest modulus.
-            */
-           num_indexes = get_hash_partition_greatest_modulus(bound);
-           break;
-
-       case PARTITION_STRATEGY_LIST:
-           num_indexes = bound->ndatums;
-           break;
-
-       case PARTITION_STRATEGY_RANGE:
-           /* Range partitioned table has an extra index. */
-           num_indexes = bound->ndatums + 1;
-           break;
-
-       default:
-           elog(ERROR, "unexpected partition strategy: %d",
-                (int) bound->strategy);
-   }
-
-   return num_indexes;
-}
-
  /*
   * get_partition_operator
   *
diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c

index fac921eea5b9329c052e1c8d3c9f7172ff00fa95..42c7c5f55412f0d739e04ee77978ed25550c0be0 100644 (file)
--- a/src/backend/partitioning/partprune.c
+++ b/src/backend/partitioning/partprune.c
@@ -781,7 +781,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
     scan_default = final_result->scan_default;
     while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
     {
-       int         partindex = context->boundinfo->indexes[i];
+       int         partindex;
+
+       Assert(i < context->boundinfo->nindexes);
+       partindex = context->boundinfo->indexes[i];
  
         if (partindex < 0)
         {
@@ -2514,20 +2517,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
         for (i = 0; i < partnatts; i++)
             isnull[i] = bms_is_member(i, nullkeys);
  
-       greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
         rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
                                                values, isnull);
  
+       greatest_modulus = boundinfo->nindexes;
         if (partindices[rowHash % greatest_modulus] >= 0)
             result->bound_offsets =
                 bms_make_singleton(rowHash % greatest_modulus);
     }
     else
     {
-       /* Getting here means at least one hash partition exists. */
-       Assert(boundinfo->ndatums > 0);
+       /* Report all valid offsets into the boundinfo->indexes array. */
         result->bound_offsets = bms_add_range(NULL, 0,
-                                             boundinfo->ndatums - 1);
+                                             boundinfo->nindexes - 1);
     }
  
     /*
@@ -3388,30 +3390,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
                              PartitionPruneStepCombine *cstep,
                              PruneStepResult **step_results)
  {
-   ListCell   *lc1;
-   PruneStepResult *result = NULL;
+   PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
     bool        firststep;
+   ListCell   *lc1;
  
     /*
      * A combine step without any source steps is an indication to not perform
      * any partition pruning.  Return all datum indexes in that case.
      */
-   result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
-   if (list_length(cstep->source_stepids) == 0)
+   if (cstep->source_stepids == NIL)
     {
         PartitionBoundInfo boundinfo = context->boundinfo;
-       int         rangemax;
-
-       /*
-        * Add all valid offsets into the boundinfo->indexes array.  For range
-        * partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
-        * valid entries; otherwise there are boundinfo->ndatums.
-        */
-       rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
-           boundinfo->ndatums : boundinfo->ndatums - 1;
  
         result->bound_offsets =
-           bms_add_range(result->bound_offsets, 0, rangemax);
+           bms_add_range(NULL, 0, boundinfo->nindexes - 1);
         result->scan_default = partition_bound_has_default(boundinfo);
         result->scan_null = partition_bound_accepts_nulls(boundinfo);
         return result;
diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h

index 6bd330dd938a9ef266f112645af0d2441c7353fb..ebf3ff1f4973c95907edb66879c47f5c71f8f797 100644 (file)
--- a/src/include/partitioning/partbounds.h
+++ b/src/include/partitioning/partbounds.h
@@ -30,7 +30,7 @@ struct RelOptInfo;                /* avoid including pathnodes.h here */
   * In the case of range partitioning, ndatums will typically be far less than
   * 2 * nparts, because a partition's upper bound and the next partition's lower
   * bound are the same in most common cases, and we only store one of them (the
- * upper bound).  In case of hash partitioning, ndatums will be same as the
+ * upper bound).  In case of hash partitioning, ndatums will be the same as the
   * number of partitions.
   *
   * For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,24 +46,31 @@ struct RelOptInfo;              /* avoid including pathnodes.h here */
   * the partition key's operator classes and collations.
   *
   * In the case of list partitioning, the indexes array stores one entry for
- * every datum, which is the index of the partition that accepts a given datum.
- * In case of range partitioning, it stores one entry per distinct range
- * datum, which is the index of the partition for which a given datum
- * is an upper bound.  In the case of hash partitioning, the number of the
- * entries in the indexes array is same as the greatest modulus amongst all
- * partitions.  For a given partition key datum-tuple, the index of the
- * partition which would accept that datum-tuple would be given by the entry
- * pointed by remainder produced when hash value of the datum-tuple is divided
- * by the greatest modulus.
+ * each datum-array entry, which is the index of the partition that accepts
+ * rows matching that datum.  So nindexes == ndatums.
+ *
+ * In the case of range partitioning, the indexes array stores one entry per
+ * distinct range datum, which is the index of the partition for which that
+ * datum is an upper bound (or -1 for a "gap" that has no partition).  It is
+ * convenient to have an extra -1 entry representing values above the last
+ * range datum, so nindexes == ndatums + 1.
+ *
+ * In the case of hash partitioning, the number of entries in the indexes
+ * array is the same as the greatest modulus amongst all partitions (which
+ * is a multiple of all partition moduli), so nindexes == greatest modulus.
+ * The indexes array is indexed according to the hash key's remainder modulo
+ * the greatest modulus, and it contains either the partition index accepting
+ * that remainder, or -1 if there is no partition for that remainder.
   */
  typedef struct PartitionBoundInfoData
  {
     char        strategy;       /* hash, list or range? */
-   int         ndatums;        /* Length of the datums following array */
+   int         ndatums;        /* Length of the datums[] array */
     Datum     **datums;
     PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
                                      * NULL for hash and list partitioned
                                      * tables */
+   int         nindexes;       /* Length of the indexes[] array */
     int        *indexes;        /* Partition indexes */
     int         null_index;     /* Index of the null-accepting partition; -1
                                  * if there isn't one */
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out

index c72a6d051f1dd02a69335b65b8bd9dd8a140ba12..bde29e38a94362ca820f16dc9d84e82127b45572 100644 (file)
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -1538,26 +1538,27 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
  -- result on different machines.  See the definitions of
  -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
  --
-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
  create table hp0 partition of hp for values with (modulus 4, remainder 0);
  create table hp3 partition of hp for values with (modulus 4, remainder 3);
  create table hp1 partition of hp for values with (modulus 4, remainder 1);
  create table hp2 partition of hp for values with (modulus 4, remainder 2);
-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
- tableoid | a |   b   
-----------+---+-------
- hp0      |   | 
- hp0      | 1 | xxx
- hp3      | 2 | xxx
- hp1      | 1 | 
- hp2      |   | xxx
- hp2      | 1 | abcde
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;
+ tableoid | a |   b   | c 
+----------+---+-------+---
+ hp0      |   |       | 0
+ hp1      | 1 |       | 1
+ hp0      | 1 | xxx   | 2
+ hp2      |   | xxx   | 3
+ hp3      | 2 | xxx   | 4
+ hp2      | 1 | abcde | 5
  (6 rows)
  
  -- partial keys won't prune, nor would non-equality conditions
@@ -1715,6 +1716,33 @@ explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and
           Filter: (((a = 1) AND (b = 'abcde'::text)) OR ((a = 2) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL)))
  (7 rows)
  
+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+                 QUERY PLAN                  
+---------------------------------------------
+ Seq Scan on hp2 hp
+   Filter: ((a = 1) AND (b = 'abcde'::text))
+(2 rows)
+
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Seq Scan on hp2 hp
+   Filter: ((a = 1) AND (b = 'abcde'::text) AND ((c = 2) OR (c = 3)))
+(2 rows)
+
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+        QUERY PLAN        
+--------------------------
+ Result
+   One-Time Filter: false
+(2 rows)
+
  drop table hp;
  --
  -- Test runtime partition pruning
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql

index ffd5fe8b0dc501f09de9ce3e53172802fa71105f..6ccb52ad1d674cbfbed6cf3da2a84d09843d22fa 100644 (file)
--- a/src/test/regress/sql/partition_prune.sql
+++ b/src/test/regress/sql/partition_prune.sql
@@ -304,19 +304,20 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
  -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
  --
  
-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
  create table hp0 partition of hp for values with (modulus 4, remainder 0);
  create table hp3 partition of hp for values with (modulus 4, remainder 3);
  create table hp1 partition of hp for values with (modulus 4, remainder 1);
  create table hp2 partition of hp for values with (modulus 4, remainder 2);
  
-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;
  
  -- partial keys won't prune, nor would non-equality conditions
  explain (costs off) select * from hp where a = 1;
@@ -337,6 +338,16 @@ explain (costs off) select * from hp where a = 2 and b = 'xxx';
  explain (costs off) select * from hp where a = 1 and b = 'abcde';
  explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and b = 'xxx') or (a is null and b is null);
  
+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+
  drop table hp;
  
  --
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jan 2021 18:41:55 +0000 (13:41 -0500)
src/backend/executor/execPartition.c		patch \| blob \| blame \| history
src/backend/partitioning/partbounds.c		patch \| blob \| blame \| history
src/backend/partitioning/partprune.c		patch \| blob \| blame \| history
src/include/partitioning/partbounds.h		patch \| blob \| blame \| history
src/test/regress/expected/partition_prune.out		patch \| blob \| blame \| history
src/test/regress/sql/partition_prune.sql		patch \| blob \| blame \| history