Reordering DISTINCT keys to match input path's pathkeys
authorRichard Guo <rguo@postgresql.org>
Tue, 26 Nov 2024 00:25:18 +0000 (09:25 +0900)
committerRichard Guo <rguo@postgresql.org>
Tue, 26 Nov 2024 00:25:18 +0000 (09:25 +0900)
The ordering of DISTINCT items is semantically insignificant, so we
can reorder them as needed.  In fact, in the parser, we absorb the
sorting semantics of the sortClause as much as possible into the
distinctClause, ensuring that one clause is a prefix of the other.
This can help avoid a possible need to re-sort.

In this commit, we attempt to adjust the DISTINCT keys to match the
input path's pathkeys.  This can likewise help avoid re-sorting, or
allow us to use incremental-sort to save efforts.

For DISTINCT ON expressions, the parser already ensures that they
match the initial ORDER BY expressions.  When reordering the DISTINCT
keys, we must ensure that the resulting pathkey list matches the
initial distinctClause pathkeys.

This introduces a new GUC, enable_distinct_reordering, which allows
the optimization to be disabled if needed.

Author: Richard Guo
Reviewed-by: Andrei Lepikhov
Discussion: https://postgr.es/m/CAMbWs48dR26cCcX0f=8bja2JKQPcU64136kHk=xekHT9xschiQ@mail.gmail.com

doc/src/sgml/config.sgml
src/backend/optimizer/path/pathkeys.c
src/backend/optimizer/plan/planner.c
src/backend/utils/misc/guc_tables.c
src/backend/utils/misc/postgresql.conf.sample
src/include/optimizer/optimizer.h
src/test/regress/expected/select_distinct.out
src/test/regress/expected/select_distinct_on.out
src/test/regress/expected/sysviews.out
src/test/regress/sql/select_distinct.sql
src/test/regress/sql/select_distinct_on.sql

index a84e60c09b95f5018390ba6d48ae6dd2d63cc7a0..76ab72db9640cad19b8e3fa3def763933d22361b 100644 (file)
@@ -5360,6 +5360,20 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-distinct-reordering" xreflabel="enable_distinct_reordering">
+      <term><varname>enable_distinct_reordering</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_distinct_reordering</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's ability to reorder DISTINCT
+        keys to match the input path's pathkeys. The default is <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-gathermerge" xreflabel="enable_gathermerge">
       <term><varname>enable_gathermerge</varname> (<type>boolean</type>)
       <indexterm>
index 7cf15e89b634735c5e1fc8d74658ea0303fa215f..03c2ac36e05f6e0507f99e76277157df6690992b 100644 (file)
@@ -2207,6 +2207,41 @@ pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
        return n;
 }
 
+/*
+ * pathkeys_useful_for_distinct
+ *             Count the number of pathkeys that are useful for DISTINCT or DISTINCT
+ *             ON clause.
+ *
+ * DISTINCT keys could be reordered to benefit from the given pathkey list.  As
+ * with pathkeys_useful_for_grouping, we return the number of leading keys in
+ * the list that are shared by the distinctClause pathkeys.
+ */
+static int
+pathkeys_useful_for_distinct(PlannerInfo *root, List *pathkeys)
+{
+       int                     n_common_pathkeys;
+
+       /*
+        * distinct_pathkeys may have become empty if all of the pathkeys were
+        * determined to be redundant.  Return 0 in this case.
+        */
+       if (root->distinct_pathkeys == NIL)
+               return 0;
+
+       /* walk the pathkeys and search for matching DISTINCT key */
+       n_common_pathkeys = 0;
+       foreach_node(PathKey, pathkey, pathkeys)
+       {
+               /* no matching DISTINCT key, we're done */
+               if (!list_member_ptr(root->distinct_pathkeys, pathkey))
+                       break;
+
+               n_common_pathkeys++;
+       }
+
+       return n_common_pathkeys;
+}
+
 /*
  * pathkeys_useful_for_setop
  *             Count the number of leading common pathkeys root's 'setop_pathkeys' in
@@ -2240,6 +2275,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
        if (nuseful2 > nuseful)
                nuseful = nuseful2;
        nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
+       if (nuseful2 > nuseful)
+               nuseful = nuseful2;
+       nuseful2 = pathkeys_useful_for_distinct(root, pathkeys);
        if (nuseful2 > nuseful)
                nuseful = nuseful2;
        nuseful2 = pathkeys_useful_for_setop(root, pathkeys);
index 1f78dc3d530fb645660cce1af9bdd07d95c653df..b665a7762ec9b7eb94c2aaa87602933dfb1f8d82 100644 (file)
@@ -67,6 +67,7 @@
 double         cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION;
 int                    debug_parallel_query = DEBUG_PARALLEL_OFF;
 bool           parallel_leader_participation = true;
+bool           enable_distinct_reordering = true;
 
 /* Hook for plugins to get control in planner() */
 planner_hook_type planner_hook = NULL;
@@ -198,6 +199,9 @@ static void create_partial_distinct_paths(PlannerInfo *root,
 static RelOptInfo *create_final_distinct_paths(PlannerInfo *root,
                                                                                           RelOptInfo *input_rel,
                                                                                           RelOptInfo *distinct_rel);
+static List *get_useful_pathkeys_for_distinct(PlannerInfo *root,
+                                                                                         List *needed_pathkeys,
+                                                                                         List *path_pathkeys);
 static RelOptInfo *create_ordered_paths(PlannerInfo *root,
                                                                                RelOptInfo *input_rel,
                                                                                PathTarget *target,
@@ -235,6 +239,12 @@ static RelOptInfo *create_partial_grouping_paths(PlannerInfo *root,
                                                                                                 grouping_sets_data *gd,
                                                                                                 GroupPathExtraData *extra,
                                                                                                 bool force_rel_creation);
+static Path *make_ordered_path(PlannerInfo *root,
+                                                          RelOptInfo *rel,
+                                                          Path *path,
+                                                          Path *cheapest_path,
+                                                          List *pathkeys,
+                                                          double limit_tuples);
 static void gather_grouping_paths(PlannerInfo *root, RelOptInfo *rel);
 static bool can_partial_agg(PlannerInfo *root);
 static void apply_scanjoin_target_to_paths(PlannerInfo *root,
@@ -4938,7 +4948,10 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 
        /*
         * Try sorting the cheapest path and incrementally sorting any paths with
-        * presorted keys and put a unique paths atop of those.
+        * presorted keys and put a unique paths atop of those.  We'll also
+        * attempt to reorder the required pathkeys to match the input path's
+        * pathkeys as much as possible, in hopes of avoiding a possible need to
+        * re-sort.
         */
        if (grouping_is_sortable(root->processed_distinctClause))
        {
@@ -4946,86 +4959,67 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                {
                        Path       *input_path = (Path *) lfirst(lc);
                        Path       *sorted_path;
-                       bool            is_sorted;
-                       int                     presorted_keys;
+                       List       *useful_pathkeys_list = NIL;
 
-                       is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys,
-                                                                                                       input_path->pathkeys,
-                                                                                                       &presorted_keys);
+                       useful_pathkeys_list =
+                               get_useful_pathkeys_for_distinct(root,
+                                                                                                root->distinct_pathkeys,
+                                                                                                input_path->pathkeys);
+                       Assert(list_length(useful_pathkeys_list) > 0);
 
-                       if (is_sorted)
-                               sorted_path = input_path;
-                       else
+                       foreach_node(List, useful_pathkeys, useful_pathkeys_list)
                        {
-                               /*
-                                * Try at least sorting the cheapest path and also try
-                                * incrementally sorting any path which is partially sorted
-                                * already (no need to deal with paths which have presorted
-                                * keys when incremental sort is disabled unless it's the
-                                * cheapest partial path).
-                                */
-                               if (input_path != cheapest_partial_path &&
-                                       (presorted_keys == 0 || !enable_incremental_sort))
+                               sorted_path = make_ordered_path(root,
+                                                                                               partial_distinct_rel,
+                                                                                               input_path,
+                                                                                               cheapest_partial_path,
+                                                                                               useful_pathkeys,
+                                                                                               -1.0);
+
+                               if (sorted_path == NULL)
                                        continue;
 
                                /*
-                                * We've no need to consider both a sort and incremental sort.
-                                * We'll just do a sort if there are no presorted keys and an
-                                * incremental sort when there are presorted keys.
+                                * An empty distinct_pathkeys means all tuples have the same
+                                * value for the DISTINCT clause.  See
+                                * create_final_distinct_paths()
                                 */
-                               if (presorted_keys == 0 || !enable_incremental_sort)
-                                       sorted_path = (Path *) create_sort_path(root,
-                                                                                                                       partial_distinct_rel,
-                                                                                                                       input_path,
-                                                                                                                       root->distinct_pathkeys,
-                                                                                                                       -1.0);
-                               else
-                                       sorted_path = (Path *) create_incremental_sort_path(root,
-                                                                                                                                               partial_distinct_rel,
-                                                                                                                                               input_path,
-                                                                                                                                               root->distinct_pathkeys,
-                                                                                                                                               presorted_keys,
-                                                                                                                                               -1.0);
-                       }
-
-                       /*
-                        * An empty distinct_pathkeys means all tuples have the same value
-                        * for the DISTINCT clause.  See create_final_distinct_paths()
-                        */
-                       if (root->distinct_pathkeys == NIL)
-                       {
-                               Node       *limitCount;
+                               if (root->distinct_pathkeys == NIL)
+                               {
+                                       Node       *limitCount;
 
-                               limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
-                                                                                               sizeof(int64),
-                                                                                               Int64GetDatum(1), false,
-                                                                                               FLOAT8PASSBYVAL);
+                                       limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                                                                                       sizeof(int64),
+                                                                                                       Int64GetDatum(1), false,
+                                                                                                       FLOAT8PASSBYVAL);
 
-                               /*
-                                * Apply a LimitPath onto the partial path to restrict the
-                                * tuples from each worker to 1.  create_final_distinct_paths
-                                * will need to apply an additional LimitPath to restrict this
-                                * to a single row after the Gather node.  If the query
-                                * already has a LIMIT clause, then we could end up with three
-                                * Limit nodes in the final plan.  Consolidating the top two
-                                * of these could be done, but does not seem worth troubling
-                                * over.
-                                */
-                               add_partial_path(partial_distinct_rel, (Path *)
-                                                                create_limit_path(root, partial_distinct_rel,
-                                                                                                  sorted_path,
-                                                                                                  NULL,
-                                                                                                  limitCount,
-                                                                                                  LIMIT_OPTION_COUNT,
-                                                                                                  0, 1));
-                       }
-                       else
-                       {
-                               add_partial_path(partial_distinct_rel, (Path *)
-                                                                create_upper_unique_path(root, partial_distinct_rel,
-                                                                                                                 sorted_path,
-                                                                                                                 list_length(root->distinct_pathkeys),
-                                                                                                                 numDistinctRows));
+                                       /*
+                                        * Apply a LimitPath onto the partial path to restrict the
+                                        * tuples from each worker to 1.
+                                        * create_final_distinct_paths will need to apply an
+                                        * additional LimitPath to restrict this to a single row
+                                        * after the Gather node.  If the query already has a
+                                        * LIMIT clause, then we could end up with three Limit
+                                        * nodes in the final plan.  Consolidating the top two of
+                                        * these could be done, but does not seem worth troubling
+                                        * over.
+                                        */
+                                       add_partial_path(partial_distinct_rel, (Path *)
+                                                                        create_limit_path(root, partial_distinct_rel,
+                                                                                                          sorted_path,
+                                                                                                          NULL,
+                                                                                                          limitCount,
+                                                                                                          LIMIT_OPTION_COUNT,
+                                                                                                          0, 1));
+                               }
+                               else
+                               {
+                                       add_partial_path(partial_distinct_rel, (Path *)
+                                                                        create_upper_unique_path(root, partial_distinct_rel,
+                                                                                                                         sorted_path,
+                                                                                                                         list_length(root->distinct_pathkeys),
+                                                                                                                         numDistinctRows));
+                               }
                        }
                }
        }
@@ -5134,7 +5128,9 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                 * Unique node on those.  We also, consider doing an explicit sort of
                 * the cheapest input path and Unique'ing that.  If any paths have
                 * presorted keys then we'll create an incremental sort atop of those
-                * before adding a unique node on the top.
+                * before adding a unique node on the top.  We'll also attempt to
+                * reorder the required pathkeys to match the input path's pathkeys as
+                * much as possible, in hopes of avoiding a possible need to re-sort.
                 *
                 * When we have DISTINCT ON, we must sort by the more rigorous of
                 * DISTINCT and ORDER BY, else it won't have the desired behavior.
@@ -5158,86 +5154,66 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                {
                        Path       *input_path = (Path *) lfirst(lc);
                        Path       *sorted_path;
-                       bool            is_sorted;
-                       int                     presorted_keys;
+                       List       *useful_pathkeys_list = NIL;
 
-                       is_sorted = pathkeys_count_contained_in(needed_pathkeys,
-                                                                                                       input_path->pathkeys,
-                                                                                                       &presorted_keys);
+                       useful_pathkeys_list =
+                               get_useful_pathkeys_for_distinct(root,
+                                                                                                needed_pathkeys,
+                                                                                                input_path->pathkeys);
+                       Assert(list_length(useful_pathkeys_list) > 0);
 
-                       if (is_sorted)
-                               sorted_path = input_path;
-                       else
+                       foreach_node(List, useful_pathkeys, useful_pathkeys_list)
                        {
-                               /*
-                                * Try at least sorting the cheapest path and also try
-                                * incrementally sorting any path which is partially sorted
-                                * already (no need to deal with paths which have presorted
-                                * keys when incremental sort is disabled unless it's the
-                                * cheapest input path).
-                                */
-                               if (input_path != cheapest_input_path &&
-                                       (presorted_keys == 0 || !enable_incremental_sort))
+                               sorted_path = make_ordered_path(root,
+                                                                                               distinct_rel,
+                                                                                               input_path,
+                                                                                               cheapest_input_path,
+                                                                                               useful_pathkeys,
+                                                                                               limittuples);
+
+                               if (sorted_path == NULL)
                                        continue;
 
                                /*
-                                * We've no need to consider both a sort and incremental sort.
-                                * We'll just do a sort if there are no presorted keys and an
-                                * incremental sort when there are presorted keys.
+                                * distinct_pathkeys may have become empty if all of the
+                                * pathkeys were determined to be redundant.  If all of the
+                                * pathkeys are redundant then each DISTINCT target must only
+                                * allow a single value, therefore all resulting tuples must
+                                * be identical (or at least indistinguishable by an equality
+                                * check).  We can uniquify these tuples simply by just taking
+                                * the first tuple.  All we do here is add a path to do "LIMIT
+                                * 1" atop of 'sorted_path'.  When doing a DISTINCT ON we may
+                                * still have a non-NIL sort_pathkeys list, so we must still
+                                * only do this with paths which are correctly sorted by
+                                * sort_pathkeys.
                                 */
-                               if (presorted_keys == 0 || !enable_incremental_sort)
-                                       sorted_path = (Path *) create_sort_path(root,
-                                                                                                                       distinct_rel,
-                                                                                                                       input_path,
-                                                                                                                       needed_pathkeys,
-                                                                                                                       limittuples);
-                               else
-                                       sorted_path = (Path *) create_incremental_sort_path(root,
-                                                                                                                                               distinct_rel,
-                                                                                                                                               input_path,
-                                                                                                                                               needed_pathkeys,
-                                                                                                                                               presorted_keys,
-                                                                                                                                               limittuples);
-                       }
-
-                       /*
-                        * distinct_pathkeys may have become empty if all of the pathkeys
-                        * were determined to be redundant.  If all of the pathkeys are
-                        * redundant then each DISTINCT target must only allow a single
-                        * value, therefore all resulting tuples must be identical (or at
-                        * least indistinguishable by an equality check).  We can uniquify
-                        * these tuples simply by just taking the first tuple.  All we do
-                        * here is add a path to do "LIMIT 1" atop of 'sorted_path'.  When
-                        * doing a DISTINCT ON we may still have a non-NIL sort_pathkeys
-                        * list, so we must still only do this with paths which are
-                        * correctly sorted by sort_pathkeys.
-                        */
-                       if (root->distinct_pathkeys == NIL)
-                       {
-                               Node       *limitCount;
+                               if (root->distinct_pathkeys == NIL)
+                               {
+                                       Node       *limitCount;
 
-                               limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
-                                                                                               sizeof(int64),
-                                                                                               Int64GetDatum(1), false,
-                                                                                               FLOAT8PASSBYVAL);
+                                       limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                                                                                       sizeof(int64),
+                                                                                                       Int64GetDatum(1), false,
+                                                                                                       FLOAT8PASSBYVAL);
 
-                               /*
-                                * If the query already has a LIMIT clause, then we could end
-                                * up with a duplicate LimitPath in the final plan. That does
-                                * not seem worth troubling over too much.
-                                */
-                               add_path(distinct_rel, (Path *)
-                                                create_limit_path(root, distinct_rel, sorted_path,
-                                                                                  NULL, limitCount,
-                                                                                  LIMIT_OPTION_COUNT, 0, 1));
-                       }
-                       else
-                       {
-                               add_path(distinct_rel, (Path *)
-                                                create_upper_unique_path(root, distinct_rel,
-                                                                                                 sorted_path,
-                                                                                                 list_length(root->distinct_pathkeys),
-                                                                                                 numDistinctRows));
+                                       /*
+                                        * If the query already has a LIMIT clause, then we could
+                                        * end up with a duplicate LimitPath in the final plan.
+                                        * That does not seem worth troubling over too much.
+                                        */
+                                       add_path(distinct_rel, (Path *)
+                                                        create_limit_path(root, distinct_rel, sorted_path,
+                                                                                          NULL, limitCount,
+                                                                                          LIMIT_OPTION_COUNT, 0, 1));
+                               }
+                               else
+                               {
+                                       add_path(distinct_rel, (Path *)
+                                                        create_upper_unique_path(root, distinct_rel,
+                                                                                                         sorted_path,
+                                                                                                         list_length(root->distinct_pathkeys),
+                                                                                                         numDistinctRows));
+                               }
                        }
                }
        }
@@ -5280,6 +5256,82 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
        return distinct_rel;
 }
 
+/*
+ * get_useful_pathkeys_for_distinct
+ *       Get useful orderings of pathkeys for distinctClause by reordering
+ *       'needed_pathkeys' to match the given 'path_pathkeys' as much as possible.
+ *
+ * This returns a list of pathkeys that can be useful for DISTINCT or DISTINCT
+ * ON clause.  For convenience, it always includes the given 'needed_pathkeys'.
+ */
+static List *
+get_useful_pathkeys_for_distinct(PlannerInfo *root, List *needed_pathkeys,
+                                                                List *path_pathkeys)
+{
+       List       *useful_pathkeys_list = NIL;
+       List       *useful_pathkeys = NIL;
+
+       /* always include the given 'needed_pathkeys' */
+       useful_pathkeys_list = lappend(useful_pathkeys_list,
+                                                                  needed_pathkeys);
+
+       if (!enable_distinct_reordering)
+               return useful_pathkeys_list;
+
+       /*
+        * Scan the given 'path_pathkeys' and construct a list of PathKey nodes
+        * that match 'needed_pathkeys', but only up to the longest matching
+        * prefix.
+        *
+        * When we have DISTINCT ON, we must ensure that the resulting pathkey
+        * list matches initial distinctClause pathkeys; otherwise, it won't have
+        * the desired behavior.
+        */
+       foreach_node(PathKey, pathkey, path_pathkeys)
+       {
+               /*
+                * The PathKey nodes are canonical, so they can be checked for
+                * equality by simple pointer comparison.
+                */
+               if (!list_member_ptr(needed_pathkeys, pathkey))
+                       break;
+               if (root->parse->hasDistinctOn &&
+                       !list_member_ptr(root->distinct_pathkeys, pathkey))
+                       break;
+
+               useful_pathkeys = lappend(useful_pathkeys, pathkey);
+       }
+
+       /* If no match at all, no point in reordering needed_pathkeys */
+       if (useful_pathkeys == NIL)
+               return useful_pathkeys_list;
+
+       /*
+        * If not full match, the resulting pathkey list is not useful without
+        * incremental sort.
+        */
+       if (list_length(useful_pathkeys) < list_length(needed_pathkeys) &&
+               !enable_incremental_sort)
+               return useful_pathkeys_list;
+
+       /* Append the remaining PathKey nodes in needed_pathkeys */
+       useful_pathkeys = list_concat_unique_ptr(useful_pathkeys,
+                                                                                        needed_pathkeys);
+
+       /*
+        * If the resulting pathkey list is the same as the 'needed_pathkeys',
+        * just drop it.
+        */
+       if (compare_pathkeys(needed_pathkeys,
+                                                useful_pathkeys) == PATHKEYS_EQUAL)
+               return useful_pathkeys_list;
+
+       useful_pathkeys_list = lappend(useful_pathkeys_list,
+                                                                  useful_pathkeys);
+
+       return useful_pathkeys_list;
+}
+
 /*
  * create_ordered_paths
  *
@@ -7008,58 +7060,6 @@ done:
        return parallel_workers;
 }
 
-/*
- * make_ordered_path
- *             Return a path ordered by 'pathkeys' based on the given 'path'.  May
- *             return NULL if it doesn't make sense to generate an ordered path in
- *             this case.
- */
-static Path *
-make_ordered_path(PlannerInfo *root, RelOptInfo *rel, Path *path,
-                                 Path *cheapest_path, List *pathkeys)
-{
-       bool            is_sorted;
-       int                     presorted_keys;
-
-       is_sorted = pathkeys_count_contained_in(pathkeys,
-                                                                                       path->pathkeys,
-                                                                                       &presorted_keys);
-
-       if (!is_sorted)
-       {
-               /*
-                * Try at least sorting the cheapest path and also try incrementally
-                * sorting any path which is partially sorted already (no need to deal
-                * with paths which have presorted keys when incremental sort is
-                * disabled unless it's the cheapest input path).
-                */
-               if (path != cheapest_path &&
-                       (presorted_keys == 0 || !enable_incremental_sort))
-                       return NULL;
-
-               /*
-                * We've no need to consider both a sort and incremental sort. We'll
-                * just do a sort if there are no presorted keys and an incremental
-                * sort when there are presorted keys.
-                */
-               if (presorted_keys == 0 || !enable_incremental_sort)
-                       path = (Path *) create_sort_path(root,
-                                                                                        rel,
-                                                                                        path,
-                                                                                        pathkeys,
-                                                                                        -1.0);
-               else
-                       path = (Path *) create_incremental_sort_path(root,
-                                                                                                                rel,
-                                                                                                                path,
-                                                                                                                pathkeys,
-                                                                                                                presorted_keys,
-                                                                                                                -1.0);
-       }
-
-       return path;
-}
-
 /*
  * add_paths_to_grouping_rel
  *
@@ -7111,7 +7111,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                                                                                 grouped_rel,
                                                                                 path,
                                                                                 cheapest_path,
-                                                                                info->pathkeys);
+                                                                                info->pathkeys,
+                                                                                -1.0);
                                if (path == NULL)
                                        continue;
 
@@ -7192,7 +7193,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
                                                                                         grouped_rel,
                                                                                         path,
                                                                                         partially_grouped_rel->cheapest_total_path,
-                                                                                        info->pathkeys);
+                                                                                        info->pathkeys,
+                                                                                        -1.0);
 
                                        if (path == NULL)
                                                continue;
@@ -7443,7 +7445,8 @@ create_partial_grouping_paths(PlannerInfo *root,
                                                                                 partially_grouped_rel,
                                                                                 path,
                                                                                 cheapest_total_path,
-                                                                                info->pathkeys);
+                                                                                info->pathkeys,
+                                                                                -1.0);
 
                                if (path == NULL)
                                        continue;
@@ -7500,7 +7503,8 @@ create_partial_grouping_paths(PlannerInfo *root,
                                                                                 partially_grouped_rel,
                                                                                 path,
                                                                                 cheapest_partial_path,
-                                                                                info->pathkeys);
+                                                                                info->pathkeys,
+                                                                                -1.0);
 
                                if (path == NULL)
                                        continue;
@@ -7586,6 +7590,58 @@ create_partial_grouping_paths(PlannerInfo *root,
        return partially_grouped_rel;
 }
 
+/*
+ * make_ordered_path
+ *             Return a path ordered by 'pathkeys' based on the given 'path'.  May
+ *             return NULL if it doesn't make sense to generate an ordered path in
+ *             this case.
+ */
+static Path *
+make_ordered_path(PlannerInfo *root, RelOptInfo *rel, Path *path,
+                                 Path *cheapest_path, List *pathkeys, double limit_tuples)
+{
+       bool            is_sorted;
+       int                     presorted_keys;
+
+       is_sorted = pathkeys_count_contained_in(pathkeys,
+                                                                                       path->pathkeys,
+                                                                                       &presorted_keys);
+
+       if (!is_sorted)
+       {
+               /*
+                * Try at least sorting the cheapest path and also try incrementally
+                * sorting any path which is partially sorted already (no need to deal
+                * with paths which have presorted keys when incremental sort is
+                * disabled unless it's the cheapest input path).
+                */
+               if (path != cheapest_path &&
+                       (presorted_keys == 0 || !enable_incremental_sort))
+                       return NULL;
+
+               /*
+                * We've no need to consider both a sort and incremental sort. We'll
+                * just do a sort if there are no presorted keys and an incremental
+                * sort when there are presorted keys.
+                */
+               if (presorted_keys == 0 || !enable_incremental_sort)
+                       path = (Path *) create_sort_path(root,
+                                                                                        rel,
+                                                                                        path,
+                                                                                        pathkeys,
+                                                                                        limit_tuples);
+               else
+                       path = (Path *) create_incremental_sort_path(root,
+                                                                                                                rel,
+                                                                                                                path,
+                                                                                                                pathkeys,
+                                                                                                                presorted_keys,
+                                                                                                                limit_tuples);
+       }
+
+       return path;
+}
+
 /*
  * Generate Gather and Gather Merge paths for a grouping relation or partial
  * grouping relation.
index 8a67f01200cad1d2dadf1c34714538ffe7bc462e..9845abd6932215c13832041a34bacbedf03f0d28 100644 (file)
@@ -996,6 +996,16 @@ struct config_bool ConfigureNamesBool[] =
                true,
                NULL, NULL, NULL
        },
+       {
+               {"enable_distinct_reordering", PGC_USERSET, QUERY_TUNING_METHOD,
+                       gettext_noop("Enables reordering of DISTINCT pathkeys."),
+                       NULL,
+                       GUC_EXPLAIN
+               },
+               &enable_distinct_reordering,
+               true,
+               NULL, NULL, NULL
+       },
        {
                {"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
                        gettext_noop("Enables genetic query optimization."),
index 39a3ac231274bbe24e3fa063a25707903fe51827..407cd1e08ca3e705541f77bf1cb0556042fc4c90 100644 (file)
 #enable_sort = on
 #enable_tidscan = on
 #enable_group_by_reordering = on
+#enable_distinct_reordering = on
 
 # - Planner Cost Constants -
 
index 93e3dc719dab23ace751a7a2ce5b3d72d69ad2dc..2e123e08b791360ebd6f4792e619aeaf6cf69a8a 100644 (file)
@@ -111,6 +111,7 @@ typedef enum
 /* GUC parameters */
 extern PGDLLIMPORT int debug_parallel_query;
 extern PGDLLIMPORT bool parallel_leader_participation;
+extern PGDLLIMPORT bool enable_distinct_reordering;
 
 extern struct PlannedStmt *planner(Query *parse, const char *query_string,
                                                                   int cursorOptions,
index 82b8e54f5f17015d2cb9051c4ff25372426a1982..379ba0bc9fa5af34d8c8f92d6350a1daad576a4e 100644 (file)
@@ -464,3 +464,135 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+--
+-- Test the planner's ability to reorder the distinctClause Pathkeys to match
+-- the input path's ordering
+--
+CREATE TABLE distinct_tbl (x int, y int);
+INSERT INTO distinct_tbl SELECT i%10, i%10 FROM generate_series(1, 1000) AS i;
+CREATE INDEX distinct_tbl_x_y_idx ON distinct_tbl (x, y);
+ANALYZE distinct_tbl;
+-- Produce results with sorting.
+SET enable_hashagg TO OFF;
+-- Ensure we avoid the need to re-sort by reordering the distinctClause
+-- Pathkeys to match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Unique
+   ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(2 rows)
+
+SELECT DISTINCT y, x FROM distinct_tbl;
+ y | x 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+-- Ensure we leverage incremental-sort by reordering the distinctClause
+-- Pathkeys to partially match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Unique
+   ->  Incremental Sort
+         Sort Key: s.x, s.y
+         Presorted Key: s.x
+         ->  Subquery Scan on s
+               ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(6 rows)
+
+SELECT DISTINCT y, x FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+ y | x 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+-- Ensure we avoid the need to re-sort in partial distinct by reordering the
+-- distinctClause Pathkeys to match the ordering of the input path
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET min_parallel_index_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl limit 10;
+                                         QUERY PLAN                                          
+---------------------------------------------------------------------------------------------
+ Limit
+   ->  Unique
+         ->  Gather Merge
+               Workers Planned: 1
+               ->  Unique
+                     ->  Parallel Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(6 rows)
+
+SELECT DISTINCT y, x FROM distinct_tbl limit 10;
+ y | x 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_index_scan_size;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+-- Ensure we reorder the distinctClause Pathkeys to match the ordering of the
+-- input path even if there is ORDER BY clause
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Sort
+   Sort Key: y
+   ->  Unique
+         ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(4 rows)
+
+SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
+ y | x 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+RESET enable_hashagg;
+DROP TABLE distinct_tbl;
index 958381afe5ebaabb4e039b3a79f678b686969419..69e0cf3959c0e02b3d1f457f418ef3d43477e794 100644 (file)
@@ -121,3 +121,128 @@ SELECT DISTINCT ON (four) four,hundred
          Filter: (four = 0)
 (3 rows)
 
+--
+-- Test the planner's ability to reorder the distinctClause Pathkeys to match
+-- the input path's ordering
+--
+CREATE TABLE distinct_tbl (x int, y int, z int);
+INSERT INTO distinct_tbl SELECT i%10, i%10, i%10 FROM generate_series(1, 1000) AS i;
+CREATE INDEX distinct_tbl_x_y_idx ON distinct_tbl (x, y);
+ANALYZE distinct_tbl;
+-- Produce results with sorting.
+SET enable_hashagg TO OFF;
+-- Ensure we avoid the need to re-sort by reordering the distinctClause
+-- Pathkeys to match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Unique
+   ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(2 rows)
+
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl;
+ x | y 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+-- Ensure we leverage incremental-sort by reordering the distinctClause
+-- Pathkeys to partially match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Unique
+   ->  Incremental Sort
+         Sort Key: s.x, s.y
+         Presorted Key: s.x
+         ->  Subquery Scan on s
+               ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(6 rows)
+
+SELECT DISTINCT ON (y, x) x, y FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+ x | y 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+-- Ensure we reorder the distinctClause Pathkeys to match the ordering of the
+-- input path even if there is ORDER BY clause
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl ORDER BY y;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Sort
+   Sort Key: y
+   ->  Unique
+         ->  Index Only Scan using distinct_tbl_x_y_idx on distinct_tbl
+(4 rows)
+
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl ORDER BY y;
+ x | y 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+-- Ensure the resulting pathkey list matches the initial distinctClause Pathkeys
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM (select * from distinct_tbl order by x, z, y) s ORDER BY y, x, z;
+                                     QUERY PLAN                                     
+------------------------------------------------------------------------------------
+ Sort
+   Sort Key: s.y, s.x, s.z
+   ->  Unique
+         ->  Incremental Sort
+               Sort Key: s.x, s.y, s.z
+               Presorted Key: s.x
+               ->  Subquery Scan on s
+                     ->  Sort
+                           Sort Key: distinct_tbl.x, distinct_tbl.z, distinct_tbl.y
+                           ->  Seq Scan on distinct_tbl
+(10 rows)
+
+SELECT DISTINCT ON (y, x) x, y FROM (select * from distinct_tbl order by x, z, y) s ORDER BY y, x, z;
+ x | y 
+---+---
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+(10 rows)
+
+RESET enable_hashagg;
+DROP TABLE distinct_tbl;
index fad7fc3a7e0a64b4807369715c0cfde845f38a24..91089ac215f34f599e4906e8fce36aa1043bbb9a 100644 (file)
@@ -150,6 +150,7 @@ select name, setting from pg_settings where name like 'enable%';
 --------------------------------+---------
  enable_async_append            | on
  enable_bitmapscan              | on
+ enable_distinct_reordering     | on
  enable_gathermerge             | on
  enable_group_by_reordering     | on
  enable_hashagg                 | on
@@ -170,7 +171,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(22 rows)
+(23 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
index da92c197aba5c9102ef57ec3f47045593526820c..50ac7dde396c6599d48b6e303a701d7fdf210a0b 100644 (file)
@@ -221,3 +221,56 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+--
+-- Test the planner's ability to reorder the distinctClause Pathkeys to match
+-- the input path's ordering
+--
+
+CREATE TABLE distinct_tbl (x int, y int);
+INSERT INTO distinct_tbl SELECT i%10, i%10 FROM generate_series(1, 1000) AS i;
+CREATE INDEX distinct_tbl_x_y_idx ON distinct_tbl (x, y);
+ANALYZE distinct_tbl;
+
+-- Produce results with sorting.
+SET enable_hashagg TO OFF;
+
+-- Ensure we avoid the need to re-sort by reordering the distinctClause
+-- Pathkeys to match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl;
+SELECT DISTINCT y, x FROM distinct_tbl;
+
+-- Ensure we leverage incremental-sort by reordering the distinctClause
+-- Pathkeys to partially match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+SELECT DISTINCT y, x FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+
+-- Ensure we avoid the need to re-sort in partial distinct by reordering the
+-- distinctClause Pathkeys to match the ordering of the input path
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET min_parallel_index_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl limit 10;
+SELECT DISTINCT y, x FROM distinct_tbl limit 10;
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_index_scan_size;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+
+-- Ensure we reorder the distinctClause Pathkeys to match the ordering of the
+-- input path even if there is ORDER BY clause
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
+SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
+
+RESET enable_hashagg;
+
+DROP TABLE distinct_tbl;
index 261e6140feb151484687cfeaf5e3270682095f71..d79dace94c41be1094cb1c75ad64f8e884aabf95 100644 (file)
@@ -42,3 +42,43 @@ SELECT DISTINCT ON (four) four,two
 EXPLAIN (COSTS OFF)
 SELECT DISTINCT ON (four) four,hundred
    FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+
+--
+-- Test the planner's ability to reorder the distinctClause Pathkeys to match
+-- the input path's ordering
+--
+
+CREATE TABLE distinct_tbl (x int, y int, z int);
+INSERT INTO distinct_tbl SELECT i%10, i%10, i%10 FROM generate_series(1, 1000) AS i;
+CREATE INDEX distinct_tbl_x_y_idx ON distinct_tbl (x, y);
+ANALYZE distinct_tbl;
+
+-- Produce results with sorting.
+SET enable_hashagg TO OFF;
+
+-- Ensure we avoid the need to re-sort by reordering the distinctClause
+-- Pathkeys to match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl;
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl;
+
+-- Ensure we leverage incremental-sort by reordering the distinctClause
+-- Pathkeys to partially match the ordering of the input path
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+SELECT DISTINCT ON (y, x) x, y FROM (SELECT * FROM distinct_tbl ORDER BY x) s;
+
+-- Ensure we reorder the distinctClause Pathkeys to match the ordering of the
+-- input path even if there is ORDER BY clause
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl ORDER BY y;
+SELECT DISTINCT ON (y, x) x, y FROM distinct_tbl ORDER BY y;
+
+-- Ensure the resulting pathkey list matches the initial distinctClause Pathkeys
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (y, x) x, y FROM (select * from distinct_tbl order by x, z, y) s ORDER BY y, x, z;
+SELECT DISTINCT ON (y, x) x, y FROM (select * from distinct_tbl order by x, z, y) s ORDER BY y, x, z;
+
+RESET enable_hashagg;
+
+DROP TABLE distinct_tbl;