Consider the "LIMIT 1" optimization with parallel DISTINCT

author David Rowley <drowley@postgresql.org>

Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)

committer David Rowley <drowley@postgresql.org>

Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)
author David Rowley <drowley@postgresql.org>
Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)
committer David Rowley <drowley@postgresql.org>
Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 01fa45b9255150153dd7ac8782f78f3ee36c3c0f..342f5ad8d0a11f6d00a49e7b8c2a1c2cdafcc987 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4737,11 +4737,45 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                                                                                                                                                 -1.0);
                         }
  
-                       add_partial_path(partial_distinct_rel, (Path *)
-                                                        create_upper_unique_path(root, partial_distinct_rel,
-                                                                                                         sorted_path,
-                                                                                                         list_length(root->distinct_pathkeys),
-                                                                                                         numDistinctRows));
+                       /*
+                        * An empty distinct_pathkeys means all tuples have the same value
+                        * for the DISTINCT clause.  See create_final_distinct_paths().
+                        */
+                       if (root->distinct_pathkeys == NIL)
+                       {
+                               Node       *limitCount;
+
+                               limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                                                                               sizeof(int64),
+                                                                                               Int64GetDatum(1), false,
+                                                                                               FLOAT8PASSBYVAL);
+
+                               /*
+                                * Apply a LimitPath onto the partial path to restrict the
+                                * tuples from each worker to 1.  create_final_distinct_paths
+                                * will need to apply an additional LimitPath to restrict this
+                                * to a single row after the Gather node.  If the query
+                                * already has a LIMIT clause, then we could end up with three
+                                * Limit nodes in the final plan.  Consolidating the top two
+                                * of these could be done, but does not seem worth troubling
+                                * over.
+                                */
+                               add_partial_path(partial_distinct_rel, (Path *)
+                                                                create_limit_path(root, partial_distinct_rel,
+                                                                                                  sorted_path,
+                                                                                                  NULL,
+                                                                                                  limitCount,
+                                                                                                  LIMIT_OPTION_COUNT,
+                                                                                                  0, 1));
+                       }
+                       else
+                       {
+                               add_partial_path(partial_distinct_rel, (Path *)
+                                                                create_upper_unique_path(root, partial_distinct_rel,
+                                                                                                                 sorted_path,
+                                                                                                                 list_length(root->distinct_pathkeys),
+                                                                                                                 numDistinctRows));
+                       }
                 }
         }
  
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out

index 9d44ea8056d1949b66bd591ac2633f3d4e9e75f1..1f72756ccb4c60e0873da354966a792097acb12e 100644 (file)
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -348,6 +348,26 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
      0 |        1 |        2 |        3
  (1 row)
  
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+-- Ensure we get a plan with a Limit 1 in both partial distinct and final
+-- distinct
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 10;
+                  QUERY PLAN                  
+----------------------------------------------
+ Limit
+   ->  Gather
+         Workers Planned: 2
+         ->  Limit
+               ->  Parallel Seq Scan on tenk1
+                     Filter: (four = 10)
+(6 rows)
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
  --
  -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
  -- very own regression file.
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql

index 1643526d991807b38c7d651d3a4390a784f02edb..da92c197aba5c9102ef57ec3f47045593526820c 100644 (file)
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -180,6 +180,19 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
  -- Ensure we only get 1 row
  SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
  
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+
+-- Ensure we get a plan with a Limit 1 in both partial distinct and final
+-- distinct
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 10;
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+
  --
  -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
  -- very own regression file.
author	David Rowley <drowley@postgresql.org>
	Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)
committer	David Rowley <drowley@postgresql.org>
	Wed, 31 Jan 2024 04:22:02 +0000 (17:22 +1300)
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/test/regress/expected/select_distinct.out		patch \| blob \| blame \| history
src/test/regress/sql/select_distinct.sql		patch \| blob \| blame \| history