Ignore nullingrels when looking up statistics

author Richard Guo <rguo@postgresql.org>

Thu, 2 Jan 2025 08:56:59 +0000 (17:56 +0900)

committer Richard Guo <rguo@postgresql.org>

Thu, 2 Jan 2025 09:06:00 +0000 (18:06 +0900)
author Richard Guo <rguo@postgresql.org>
Thu, 2 Jan 2025 08:56:59 +0000 (17:56 +0900)
committer Richard Guo <rguo@postgresql.org>
Thu, 2 Jan 2025 09:06:00 +0000 (18:06 +0900)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index d4c6f6ed5a408b2bbb90dbedb348ae3029169db0..93e4a8906c51c41c5794825ac5bd5f3d35c7e281 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -121,6 +121,7 @@
  #include "parser/parse_clause.h"
  #include "parser/parse_relation.h"
  #include "parser/parsetree.h"
+#include "rewrite/rewriteManip.h"
  #include "statistics/statistics.h"
  #include "storage/bufmgr.h"
  #include "utils/acl.h"
@@ -3306,6 +3307,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
  
         ndistinct = get_variable_numdistinct(vardata, &isdefault);
  
+       /*
+        * The nullingrels bits within the var could cause the same var to be
+        * counted multiple times if it's marked with different nullingrels.  They
+        * could also prevent us from matching the var to the expressions in
+        * extended statistics (see estimate_multivariate_ndistinct).  So strip
+        * them out first.
+        */
+       var = remove_nulling_relids(var, root->outer_join_rels, NULL);
+
         foreach(lc, varinfos)
         {
                 varinfo = (GroupVarInfo *) lfirst(lc);
@@ -5025,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
  {
         Node       *basenode;
         Relids          varnos;
+       Relids          basevarnos;
         RelOptInfo *onerel;
  
         /* Make sure we don't return dangling pointers in vardata */
@@ -5066,10 +5077,11 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
          * relation are considered "real" vars.
          */
         varnos = pull_varnos(root, basenode);
+       basevarnos = bms_difference(varnos, root->outer_join_rels);
  
         onerel = NULL;
  
-       if (bms_is_empty(varnos))
+       if (bms_is_empty(basevarnos))
         {
                 /* No Vars at all ... must be pseudo-constant clause */
         }
@@ -5077,7 +5089,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
         {
                 int                     relid;
  
-               if (bms_get_singleton_member(varnos, &relid))
+               /* Check if the expression is in vars of a single base relation */
+               if (bms_get_singleton_member(basevarnos, &relid))
                 {
                         if (varRelid == 0 || varRelid == relid)
                         {
@@ -5107,7 +5120,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
                 }
         }
  
-       bms_free(varnos);
+       bms_free(basevarnos);
  
         vardata->var = node;
         vardata->atttype = exprType(node);
@@ -5132,6 +5145,14 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
                 ListCell   *slist;
                 Oid                     userid;
  
+               /*
+                * The nullingrels bits within the expression could prevent us from
+                * matching it to expressional index columns or to the expressions in
+                * extended statistics.  So strip them out first.
+                */
+               if (bms_overlap(varnos, root->outer_join_rels))
+                       node = remove_nulling_relids(node, root->outer_join_rels, NULL);
+
                 /*
                  * Determine the user ID to use for privilege checks: either
                  * onerel->userid if it's set (e.g., in case we're accessing the table
@@ -5402,6 +5423,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
                         }
                 }
         }
+
+       bms_free(varnos);
  }
  
  /*
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out

index 0c9b312eafdf3c37520a340d5f3fbd2d0efe77ff..079fcf46f0d9334a6d4ca855097aa332e2dc9420 100644 (file)
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -2573,10 +2573,11 @@ where t1.f1 = coalesce(t2.f1, 1);
                 ->  Materialize
                       ->  Seq Scan on int4_tbl t2
                             Filter: (f1 > 1)
-         ->  Seq Scan on int4_tbl t3
+         ->  Materialize
+               ->  Seq Scan on int4_tbl t3
     ->  Materialize
           ->  Seq Scan on int4_tbl t4
-(13 rows)
+(14 rows)
  
  explain (costs off)
  select * from int4_tbl t1
@@ -8217,3 +8218,24 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
  
  RESET enable_seqscan;
  RESET enable_indexscan;
+-- Test that we do not account for nullingrels when looking up statistics
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+                 QUERY PLAN                 
+--------------------------------------------
+ Group
+   Group Key: t2.a, (COALESCE(t2.a))
+   ->  Sort
+         Sort Key: t2.a, (COALESCE(t2.a))
+         ->  Nested Loop Left Join
+               ->  Seq Scan on group_tbl t1
+               ->  Seq Scan on group_tbl t2
+(7 rows)
+
+DROP TABLE group_tbl;
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql

index 8cfc1053cb78b1061ec28615dd723c1a8922f2a2..779d56cb30f44ae2c9650c2cf9e271824dbf0c4d 100644 (file)
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -3033,3 +3033,16 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
  
  RESET enable_seqscan;
  RESET enable_indexscan;
+
+-- Test that we do not account for nullingrels when looking up statistics
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+
+DROP TABLE group_tbl;
author	Richard Guo <rguo@postgresql.org>
	Thu, 2 Jan 2025 08:56:59 +0000 (17:56 +0900)
committer	Richard Guo <rguo@postgresql.org>
	Thu, 2 Jan 2025 09:06:00 +0000 (18:06 +0900)
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/test/regress/expected/join.out		patch \| blob \| blame \| history
src/test/regress/sql/join.sql		patch \| blob \| blame \| history