Unbreak Finalize HashAggregate over Partial HashAggregate.
author Robert Haas <rhaas@postgresql.org>
Fri, 16 Dec 2016 15:03:08 +0000 (10:03 -0500)
committer Robert Haas <rhaas@postgresql.org>
Fri, 16 Dec 2016 15:03:08 +0000 (10:03 -0500)
Commit 5dfc198146b49ce7ecc8a1fc9d5e171fb75f6ba5 introduced the use
of a new type of hash table with linear reprobing for hash aggregates.
Such a hash table behaves very poorly if keys are inserted in hash
order, which does in fact happen in the case where a query uses a
Finalize HashAggregate node fed (via Gather) by a Partial
HashAggregate node.  In fact, queries with this type of plan tend
to run effectively forever.

Fix that by seeding the hash value differently in each worker
(and in the leader, if it participates).

Andres Freund and Robert Haas

src/backend/executor/execGrouping.c
src/backend/executor/nodeAgg.c
src/backend/executor/nodeRecursiveunion.c
src/backend/executor/nodeSetOp.c
src/backend/executor/nodeSubplan.c
src/include/executor/executor.h
src/include/nodes/execnodes.h

index 94cc59de7e1cbcc21807e7553342eb1d80327087..3149fbe975c2caaed1d16ca6f126189a98969188 100644 (file)
@@ -18,6 +18,8 @@
  */
 #include "postgres.h"
 
+#include "access/hash.h"
+#include "access/parallel.h"
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
@@ -289,7 +291,8 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
                                        FmgrInfo *eqfunctions,
                                        FmgrInfo *hashfunctions,
                                        long nbuckets, Size additionalsize,
-                                       MemoryContext tablecxt, MemoryContext tempcxt)
+                                       MemoryContext tablecxt, MemoryContext tempcxt,
+                                       bool use_variable_hash_iv)
 {
        TupleHashTable hashtable;
        Size            entrysize = sizeof(TupleHashEntryData) + additionalsize;
@@ -314,6 +317,19 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
        hashtable->in_hash_funcs = NULL;
        hashtable->cur_eq_funcs = NULL;
 
+       /*
+        * If parallelism is in use, even if the master backend is performing the
+        * scan itself, we don't want to create the hashtable exactly the same way
+        * in all workers. As hashtables are iterated over in keyspace-order,
+        * doing so in all processes in the same way is likely to lead to
+        * "unbalanced" hashtables when the table size initially is
+        * underestimated.
+        */
+       if (use_variable_hash_iv)
+               hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+       else
+               hashtable->hash_iv = 0;
+
        hashtable->hashtab = tuplehash_create(tablecxt, nbuckets);
        hashtable->hashtab->private_data = hashtable;
 
@@ -450,7 +466,7 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
        TupleHashTable hashtable = (TupleHashTable) tb->private_data;
        int                     numCols = hashtable->numCols;
        AttrNumber *keyColIdx = hashtable->keyColIdx;
-       uint32          hashkey = 0;
+       uint32          hashkey = hashtable->hash_iv;
        TupleTableSlot *slot;
        FmgrInfo   *hashfunctions;
        int                     i;
index eefb3d678c639ab3fc56a6d42bcb9ee670aaf470..a093862f34b7b9fc995e07c2f8cf353b00c429f1 100644 (file)
@@ -1723,7 +1723,8 @@ build_hash_table(AggState *aggstate)
                                                                                          node->numGroups,
                                                                                          additionalsize,
                                                         aggstate->aggcontexts[0]->ecxt_per_tuple_memory,
-                                                                                         tmpmem);
+                                                                                         tmpmem,
+                                                                 !DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 }
 
 /*
index acded079e245ddce63f41062144f3a2599bcf8dc..5b734c05050e658b2b27767c8acc4a1df3dcb245 100644 (file)
@@ -43,7 +43,8 @@ build_hash_table(RecursiveUnionState *rustate)
                                                                                         node->numGroups,
                                                                                         0,
                                                                                         rustate->tableContext,
-                                                                                        rustate->tempContext);
+                                                                                        rustate->tempContext,
+                                                                                        false);
 }
 
 
index e94555ead894ea632aa1c36b0799b4029a093aa0..760b93521a668c14397ff912ca0ea918f07e1782 100644 (file)
@@ -130,7 +130,8 @@ build_hash_table(SetOpState *setopstate)
                                                                                                node->numGroups,
                                                                                                0,
                                                                                                setopstate->tableContext,
-                                                                                               setopstate->tempContext);
+                                                                                               setopstate->tempContext,
+                                                                                               false);
 }
 
 /*
index 8ca8fc460ca90ae729fc1d48b6314b26a80f1915..d3436000d0fad01510d3cdce247051486fe86aa1 100644 (file)
@@ -510,7 +510,8 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                                                                  nbuckets,
                                                                                  0,
                                                                                  node->hashtablecxt,
-                                                                                 node->hashtempcxt);
+                                                                                 node->hashtempcxt,
+                                                                                 false);
 
        if (!subplan->unknownEqFalse)
        {
@@ -529,7 +530,8 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                                                                          nbuckets,
                                                                                          0,
                                                                                          node->hashtablecxt,
-                                                                                         node->hashtempcxt);
+                                                                                         node->hashtempcxt,
+                                                                                         false);
        }
 
        /*
index b4d09f9564019510d1d25af5f16ded6d590815f2..3f649faf2fe1b24083dc6380e85e705d06713cc0 100644 (file)
@@ -143,7 +143,7 @@ extern TupleHashTable BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
                                        FmgrInfo *hashfunctions,
                                        long nbuckets, Size additionalsize,
                                        MemoryContext tablecxt,
-                                       MemoryContext tempcxt);
+                                       MemoryContext tempcxt, bool use_variable_hash_iv);
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
                                         TupleTableSlot *slot,
                                         bool *isnew);
index 1de5c8196d70dd7639581b0385e2b7abd1f82dc8..703604ab9d746f4f1b2010c3187bcc6d052fe5e8 100644 (file)
@@ -533,6 +533,7 @@ typedef struct TupleHashTableData
        TupleTableSlot *inputslot;      /* current input tuple's slot */
        FmgrInfo   *in_hash_funcs;      /* hash functions for input datatype(s) */
        FmgrInfo   *cur_eq_funcs;       /* equality functions for input vs. table */
+       uint32          hash_iv;                /* hash-function IV */
 }      TupleHashTableData;
 
 typedef tuplehash_iterator TupleHashIterator;