Allow Memoize to operate in binary comparison mode

Memoize would always use the hash equality operator for the cache key types to determine if the current set of parameters were the same as some previously cached set. Certain types such as floating points where -0.0 and +0.0 differ in their binary representation but are classed as equal by the hash equality operator may cause problems as unless the join uses the same operator it's possible that whichever join operator is being used would be able to distinguish the two values. In which case we may accidentally return in the incorrect rows out of the cache. To fix this here we add a binary mode to Memoize to allow it to the current set of parameters to previously cached values by comparing bit-by-bit rather than logically using the hash equality operator. This binary mode is always used for LATERAL joins and it's used for normal joins when any of the join operators are not hashable. Reported-by: Tom Lane Author: David Rowley Discussion: https://postgr.es/m/3004308.1632952496@sss.pgh.pa.us Backpatch-through: 14, where Memoize was added
author: David Rowley 2021-11-23 21:06:59 +0000
committer: David Rowley 2021-11-23 21:06:59 +0000
commit: e502150f7d0be41e3c8784be007fa871a32d8a7f (patch)
tree: a6c96abbe3eae534d938d05539627b4f03d23f62 /src/test
parent: 1922d7c6e1a74178bd2f1d5aa5a6ab921b3fcd34 (diff)
4 files changed, 149 insertions, 14 deletions
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 84331659e7d..d5b5b775fdd 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3686,9 +3686,10 @@ where t1.unique1 = 1;
                      Index Cond: (hundred = t1.hundred)
          ->  Memoize
                Cache Key: t2.thousand
+               Cache Mode: logical
                ->  Index Scan using tenk1_unique2 on tenk1 t3
                      Index Cond: (unique2 = t2.thousand)
-(13 rows)
+(14 rows)
 
 explain (costs off)
 select * from tenk1 t1 left join
@@ -3708,9 +3709,10 @@ where t1.unique1 = 1;
                      Index Cond: (hundred = t1.hundred)
          ->  Memoize
                Cache Key: t2.thousand
+               Cache Mode: logical
                ->  Index Scan using tenk1_unique2 on tenk1 t3
                      Index Cond: (unique2 = t2.thousand)
-(13 rows)
+(14 rows)
 
 explain (costs off)
 select count(*) from
@@ -4238,11 +4240,12 @@ where t1.f1 = ss.f1;
    ->  Memoize
          Output: (i8.q1), t2.f1
          Cache Key: i8.q1
+         Cache Mode: binary
          ->  Limit
                Output: (i8.q1), t2.f1
                ->  Seq Scan on public.text_tbl t2
                      Output: i8.q1, t2.f1
-(19 rows)
+(20 rows)
 
 select * from
   text_tbl t1
@@ -4282,6 +4285,7 @@ where t1.f1 = ss2.f1;
          ->  Memoize
                Output: (i8.q1), t2.f1
                Cache Key: i8.q1
+               Cache Mode: binary
                ->  Limit
                      Output: (i8.q1), t2.f1
                      ->  Seq Scan on public.text_tbl t2
@@ -4289,11 +4293,12 @@ where t1.f1 = ss2.f1;
    ->  Memoize
          Output: ((i8.q1)), (t2.f1)
          Cache Key: (i8.q1), t2.f1
+         Cache Mode: binary
          ->  Limit
                Output: ((i8.q1)), (t2.f1)
                ->  Seq Scan on public.text_tbl t3
                      Output: (i8.q1), t2.f1
-(28 rows)
+(30 rows)
 
 select * from
   text_tbl t1
@@ -4342,6 +4347,7 @@ where tt1.f1 = ss1.c0;
    ->  Memoize
          Output: ss1.c0
          Cache Key: tt4.f1
+         Cache Mode: binary
          ->  Subquery Scan on ss1
                Output: ss1.c0
                Filter: (ss1.c0 = 'foo'::text)
@@ -4349,7 +4355,7 @@ where tt1.f1 = ss1.c0;
                      Output: (tt4.f1)
                      ->  Seq Scan on public.text_tbl tt5
                            Output: tt4.f1
-(32 rows)
+(33 rows)
 
 select 1 from
   text_tbl as tt1
@@ -5058,8 +5064,9 @@ explain (costs off)
          ->  Seq Scan on tenk1 a
          ->  Memoize
                Cache Key: a.two
+               Cache Mode: binary
                ->  Function Scan on generate_series g
-(6 rows)
+(7 rows)
 
 explain (costs off)
   select count(*) from tenk1 a cross join lateral generate_series(1,two) g;
@@ -5070,8 +5077,9 @@ explain (costs off)
          ->  Seq Scan on tenk1 a
          ->  Memoize
                Cache Key: a.two
+               Cache Mode: binary
                ->  Function Scan on generate_series g
-(6 rows)
+(7 rows)
 
 -- don't need the explicit LATERAL keyword for functions
 explain (costs off)
@@ -5083,8 +5091,9 @@ explain (costs off)
          ->  Seq Scan on tenk1 a
          ->  Memoize
                Cache Key: a.two
+               Cache Mode: binary
                ->  Function Scan on generate_series g
-(6 rows)
+(7 rows)
 
 -- lateral with UNION ALL subselect
 explain (costs off)
@@ -5145,9 +5154,10 @@ explain (costs off)
                ->  Values Scan on "*VALUES*"
          ->  Memoize
                Cache Key: "*VALUES*".column1
+               Cache Mode: logical
                ->  Index Only Scan using tenk1_unique2 on tenk1 b
                      Index Cond: (unique2 = "*VALUES*".column1)
-(9 rows)
+(10 rows)
 
 select count(*) from tenk1 a,
   tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x;
diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out
index 9a025c4a7ab..0ed5d8474af 100644
--- a/src/test/regress/expected/memoize.out
+++ b/src/test/regress/expected/memoize.out
@@ -44,11 +44,12 @@ WHERE t2.unique1 < 1000;', false);
                Rows Removed by Filter: 9000
          ->  Memoize (actual rows=1 loops=N)
                Cache Key: t2.twenty
+               Cache Mode: logical
                Hits: 980  Misses: 20  Evictions: Zero  Overflows: 0  Memory Usage: NkB
                ->  Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=1 loops=N)
                      Index Cond: (unique1 = t2.twenty)
                      Heap Fetches: N
-(11 rows)
+(12 rows)
 
 -- And check we get the expected results.
 SELECT COUNT(*),AVG(t1.unique1) FROM tenk1 t1
@@ -73,11 +74,12 @@ WHERE t1.unique1 < 1000;', false);
                Rows Removed by Filter: 9000
          ->  Memoize (actual rows=1 loops=N)
                Cache Key: t1.twenty
+               Cache Mode: logical
                Hits: 980  Misses: 20  Evictions: Zero  Overflows: 0  Memory Usage: NkB
                ->  Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=1 loops=N)
                      Index Cond: (unique1 = t1.twenty)
                      Heap Fetches: N
-(11 rows)
+(12 rows)
 
 -- And check we get the expected results.
 SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1,
@@ -107,12 +109,94 @@ WHERE t2.unique1 < 1200;', true);
                Rows Removed by Filter: 8800
          ->  Memoize (actual rows=1 loops=N)
                Cache Key: t2.thousand
+               Cache Mode: logical
                Hits: N  Misses: N  Evictions: N  Overflows: 0  Memory Usage: NkB
                ->  Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=1 loops=N)
                      Index Cond: (unique1 = t2.thousand)
                      Heap Fetches: N
-(11 rows)
+(12 rows)
 
+CREATE TABLE flt (f float);
+CREATE INDEX flt_f_idx ON flt (f);
+INSERT INTO flt VALUES('-0.0'::float),('+0.0'::float);
+ANALYZE flt;
+SET enable_seqscan TO off;
+-- Ensure memoize operates in logical mode
+SELECT explain_memoize('
+SELECT * FROM flt f1 INNER JOIN flt f2 ON f1.f = f2.f;', false);
+                                explain_memoize                                
+-------------------------------------------------------------------------------
+ Nested Loop (actual rows=4 loops=N)
+   ->  Index Only Scan using flt_f_idx on flt f1 (actual rows=2 loops=N)
+         Heap Fetches: N
+   ->  Memoize (actual rows=2 loops=N)
+         Cache Key: f1.f
+         Cache Mode: logical
+         Hits: 1  Misses: 1  Evictions: Zero  Overflows: 0  Memory Usage: NkB
+         ->  Index Only Scan using flt_f_idx on flt f2 (actual rows=2 loops=N)
+               Index Cond: (f = f1.f)
+               Heap Fetches: N
+(10 rows)
+
+-- Ensure memoize operates in binary mode
+SELECT explain_memoize('
+SELECT * FROM flt f1 INNER JOIN flt f2 ON f1.f >= f2.f;', false);
+                                explain_memoize                                
+-------------------------------------------------------------------------------
+ Nested Loop (actual rows=4 loops=N)
+   ->  Index Only Scan using flt_f_idx on flt f1 (actual rows=2 loops=N)
+         Heap Fetches: N
+   ->  Memoize (actual rows=2 loops=N)
+         Cache Key: f1.f
+         Cache Mode: binary
+         Hits: 0  Misses: 2  Evictions: Zero  Overflows: 0  Memory Usage: NkB
+         ->  Index Only Scan using flt_f_idx on flt f2 (actual rows=2 loops=N)
+               Index Cond: (f <= f1.f)
+               Heap Fetches: N
+(10 rows)
+
+DROP TABLE flt;
+-- Exercise Memoize in binary mode with a large fixed width type and a
+-- varlena type.
+CREATE TABLE strtest (n name, t text);
+CREATE INDEX strtest_n_idx ON strtest (n);
+CREATE INDEX strtest_t_idx ON strtest (t);
+INSERT INTO strtest VALUES('one','one'),('two','two'),('three',repeat(md5('three'),100));
+-- duplicate rows so we get some cache hits
+INSERT INTO strtest SELECT * FROM strtest;
+ANALYZE strtest;
+-- Ensure we get 3 hits and 3 misses
+SELECT explain_memoize('
+SELECT * FROM strtest s1 INNER JOIN strtest s2 ON s1.n >= s2.n;', false);
+                                 explain_memoize                                  
+----------------------------------------------------------------------------------
+ Nested Loop (actual rows=24 loops=N)
+   ->  Seq Scan on strtest s1 (actual rows=6 loops=N)
+   ->  Memoize (actual rows=4 loops=N)
+         Cache Key: s1.n
+         Cache Mode: binary
+         Hits: 3  Misses: 3  Evictions: Zero  Overflows: 0  Memory Usage: NkB
+         ->  Index Scan using strtest_n_idx on strtest s2 (actual rows=4 loops=N)
+               Index Cond: (n <= s1.n)
+(8 rows)
+
+-- Ensure we get 3 hits and 3 misses
+SELECT explain_memoize('
+SELECT * FROM strtest s1 INNER JOIN strtest s2 ON s1.t >= s2.t;', false);
+                                 explain_memoize                                  
+----------------------------------------------------------------------------------
+ Nested Loop (actual rows=24 loops=N)
+   ->  Seq Scan on strtest s1 (actual rows=6 loops=N)
+   ->  Memoize (actual rows=4 loops=N)
+         Cache Key: s1.t
+         Cache Mode: binary
+         Hits: 3  Misses: 3  Evictions: Zero  Overflows: 0  Memory Usage: NkB
+         ->  Index Scan using strtest_t_idx on strtest s2 (actual rows=4 loops=N)
+               Index Cond: (t <= s1.t)
+(8 rows)
+
+DROP TABLE strtest;
+RESET enable_seqscan;
 RESET enable_mergejoin;
 RESET work_mem;
 RESET enable_bitmapscan;
@@ -140,9 +224,10 @@ WHERE t1.unique1 < 1000;
                                  Index Cond: (unique1 < 1000)
                      ->  Memoize
                            Cache Key: t1.twenty
+                           Cache Mode: logical
                            ->  Index Only Scan using tenk1_unique1 on tenk1 t2
                                  Index Cond: (unique1 = t1.twenty)
-(13 rows)
+(14 rows)
 
 -- And ensure the parallel plan gives us the correct results.
 SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1,
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index 07426260330..4e8ddc70613 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -1139,13 +1139,14 @@ where o.ten = 1;
                Filter: (ten = 1)
          ->  Memoize
                Cache Key: o.four
+               Cache Mode: binary
                ->  CTE Scan on x
                      CTE x
                        ->  Recursive Union
                              ->  Result
                              ->  WorkTable Scan on x x_1
                                    Filter: (a < 10)
-(12 rows)
+(13 rows)
 
 select sum(o.four), sum(ss.a) from
   onek o cross join lateral (
diff --git a/src/test/regress/sql/memoize.sql b/src/test/regress/sql/memoize.sql
index 548cc3eee30..3c7360adf9c 100644
--- a/src/test/regress/sql/memoize.sql
+++ b/src/test/regress/sql/memoize.sql
@@ -65,6 +65,45 @@ SELECT explain_memoize('
 SELECT COUNT(*),AVG(t1.unique1) FROM tenk1 t1
 INNER JOIN tenk1 t2 ON t1.unique1 = t2.thousand
 WHERE t2.unique1 < 1200;', true);
+
+CREATE TABLE flt (f float);
+CREATE INDEX flt_f_idx ON flt (f);
+INSERT INTO flt VALUES('-0.0'::float),('+0.0'::float);
+ANALYZE flt;
+
+SET enable_seqscan TO off;
+
+-- Ensure memoize operates in logical mode
+SELECT explain_memoize('
+SELECT * FROM flt f1 INNER JOIN flt f2 ON f1.f = f2.f;', false);
+
+-- Ensure memoize operates in binary mode
+SELECT explain_memoize('
+SELECT * FROM flt f1 INNER JOIN flt f2 ON f1.f >= f2.f;', false);
+
+DROP TABLE flt;
+
+-- Exercise Memoize in binary mode with a large fixed width type and a
+-- varlena type.
+CREATE TABLE strtest (n name, t text);
+CREATE INDEX strtest_n_idx ON strtest (n);
+CREATE INDEX strtest_t_idx ON strtest (t);
+INSERT INTO strtest VALUES('one','one'),('two','two'),('three',repeat(md5('three'),100));
+-- duplicate rows so we get some cache hits
+INSERT INTO strtest SELECT * FROM strtest;
+ANALYZE strtest;
+
+-- Ensure we get 3 hits and 3 misses
+SELECT explain_memoize('
+SELECT * FROM strtest s1 INNER JOIN strtest s2 ON s1.n >= s2.n;', false);
+
+-- Ensure we get 3 hits and 3 misses
+SELECT explain_memoize('
+SELECT * FROM strtest s1 INNER JOIN strtest s2 ON s1.t >= s2.t;', false);
+
+DROP TABLE strtest;
+
+RESET enable_seqscan;
 RESET enable_mergejoin;
 RESET work_mem;
 RESET enable_bitmapscan;
author	David Rowley	2021-11-23 21:06:59 +0000
committer	David Rowley	2021-11-23 21:06:59 +0000
commit	e502150f7d0be41e3c8784be007fa871a32d8a7f (patch)
tree	a6c96abbe3eae534d938d05539627b4f03d23f62 /src/test
parent	1922d7c6e1a74178bd2f1d5aa5a6ab921b3fcd34 (diff)