Fix dynahash.c to suppress hash bucket splits while a hash_seq_search() scan is in progress on the same hashtable.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index d47dae0c412420b031dda689e9a4cff033633787..d705d7a5c1ead2065b80c132ca50b98d00061f89 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.239 2007/04/03 16:34:35 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.240 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1631,6 +1631,7 @@ CommitTransaction(void)
     /* smgrcommit already done */
     AtEOXact_Files();
     AtEOXact_ComboCid();
+   AtEOXact_HashTables(true);
     pgstat_clear_snapshot();
     pgstat_count_xact_commit();
     pgstat_report_txn_timestamp(0);
@@ -1849,6 +1850,7 @@ PrepareTransaction(void)
     /* smgrcommit already done */
     AtEOXact_Files();
     AtEOXact_ComboCid();
+   AtEOXact_HashTables(true);
     pgstat_clear_snapshot();
  
     CurrentResourceOwner = NULL;
@@ -2003,6 +2005,7 @@ AbortTransaction(void)
     smgrabort();
     AtEOXact_Files();
     AtEOXact_ComboCid();
+   AtEOXact_HashTables(false);
     pgstat_clear_snapshot();
     pgstat_count_xact_rollback();
     pgstat_report_txn_timestamp(0);
@@ -3716,6 +3719,7 @@ CommitSubTransaction(void)
                           s->parent->subTransactionId);
     AtEOSubXact_Files(true, s->subTransactionId,
                       s->parent->subTransactionId);
+   AtEOSubXact_HashTables(true, s->nestingLevel);
  
     /*
      * We need to restore the upper transaction's read-only state, in case the
@@ -3827,6 +3831,7 @@ AbortSubTransaction(void)
                               s->parent->subTransactionId);
         AtEOSubXact_Files(false, s->subTransactionId,
                           s->parent->subTransactionId);
+       AtEOSubXact_HashTables(false, s->nestingLevel);
     }
  
     /*
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c

index e7976ebb6e1280749f1055f8bb8153a3a5cd6493..e9b953f709a107050f722ed5c78960a2b065f8d2 100644 (file)
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -10,7 +10,7 @@
   * Copyright (c) 2002-2007, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.73 2007/04/16 18:21:07 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.74 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -21,7 +21,7 @@
  #include "catalog/pg_type.h"
  #include "commands/explain.h"
  #include "commands/prepare.h"
-#include "funcapi.h"
+#include "miscadmin.h"
  #include "parser/analyze.h"
  #include "parser/parse_coerce.h"
  #include "parser/parse_expr.h"
@@ -743,92 +743,99 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, ExplainStmt *stmt,
  Datum
  pg_prepared_statement(PG_FUNCTION_ARGS)
  {
-   FuncCallContext *funcctx;
-   HASH_SEQ_STATUS *hash_seq;
-   PreparedStatement *prep_stmt;
-
-   /* stuff done only on the first call of the function */
-   if (SRF_IS_FIRSTCALL())
-   {
-       TupleDesc   tupdesc;
-       MemoryContext oldcontext;
-
-       /* create a function context for cross-call persistence */
-       funcctx = SRF_FIRSTCALL_INIT();
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   TupleDesc   tupdesc;
+   Tuplestorestate *tupstore;
+   MemoryContext per_query_ctx;
+   MemoryContext oldcontext;
+
+   /* check to see if caller supports us returning a tuplestore */
+   if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("set-valued function called in context that cannot accept a set")));
+   if (!(rsinfo->allowedModes & SFRM_Materialize))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("materialize mode required, but it is not " \
+                       "allowed in this context")));
  
-       /*
-        * switch to memory context appropriate for multiple function calls
-        */
-       oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+   /* need to build tuplestore in query context */
+   per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+   oldcontext = MemoryContextSwitchTo(per_query_ctx);
  
-       /* allocate memory for user context */
-       if (prepared_queries)
-       {
-           hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-           hash_seq_init(hash_seq, prepared_queries);
-           funcctx->user_fctx = (void *) hash_seq;
-       }
-       else
-           funcctx->user_fctx = NULL;
+   /*
+    * build tupdesc for result tuples. This must match the definition of
+    * the pg_prepared_statements view in system_views.sql
+    */
+   tupdesc = CreateTemplateTupleDesc(5, false);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                      TEXTOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                      TEXTOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
+                      TIMESTAMPTZOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
+                      REGTYPEARRAYOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
+                      BOOLOID, -1, 0);
  
-       /*
-        * build tupdesc for result tuples. This must match the definition of
-        * the pg_prepared_statements view in system_views.sql
-        */
-       tupdesc = CreateTemplateTupleDesc(5, false);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                          TEXTOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                          TEXTOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
-                          TIMESTAMPTZOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
-                          REGTYPEARRAYOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
-                          BOOLOID, -1, 0);
-
-       funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-       MemoryContextSwitchTo(oldcontext);
-   }
+   /*
+    * We put all the tuples into a tuplestore in one scan of the hashtable.
+    * This avoids any issue of the hashtable possibly changing between calls.
+    */
+   tupstore = tuplestore_begin_heap(true, false, work_mem);
  
-   /* stuff done on every call of the function */
-   funcctx = SRF_PERCALL_SETUP();
-   hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+   /* hash table might be uninitialized */
+   if (prepared_queries)
+   {
+       HASH_SEQ_STATUS hash_seq;
+       PreparedStatement *prep_stmt;
  
-   /* if the hash table is uninitialized, we're done */
-   if (hash_seq == NULL)
-       SRF_RETURN_DONE(funcctx);
+       hash_seq_init(&hash_seq, prepared_queries);
+       while ((prep_stmt = hash_seq_search(&hash_seq)) != NULL)
+       {
+           HeapTuple   tuple;
+           Datum       values[5];
+           bool        nulls[5];
  
-   prep_stmt = hash_seq_search(hash_seq);
-   if (prep_stmt)
-   {
-       Datum       result;
-       HeapTuple   tuple;
-       Datum       values[5];
-       bool        nulls[5];
+           /* generate junk in short-term context */
+           MemoryContextSwitchTo(oldcontext);
  
-       MemSet(nulls, 0, sizeof(nulls));
+           MemSet(nulls, 0, sizeof(nulls));
  
-       values[0] = DirectFunctionCall1(textin,
+           values[0] = DirectFunctionCall1(textin,
                                       CStringGetDatum(prep_stmt->stmt_name));
  
-       if (prep_stmt->plansource->query_string == NULL)
-           nulls[1] = true;
-       else
-           values[1] = DirectFunctionCall1(textin,
+           if (prep_stmt->plansource->query_string == NULL)
+               nulls[1] = true;
+           else
+               values[1] = DirectFunctionCall1(textin,
                         CStringGetDatum(prep_stmt->plansource->query_string));
  
-       values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
-       values[3] = build_regtype_array(prep_stmt->plansource->param_types,
-                                       prep_stmt->plansource->num_params);
-       values[4] = BoolGetDatum(prep_stmt->from_sql);
+           values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
+           values[3] = build_regtype_array(prep_stmt->plansource->param_types,
+                                           prep_stmt->plansource->num_params);
+           values[4] = BoolGetDatum(prep_stmt->from_sql);
+
+           tuple = heap_form_tuple(tupdesc, values, nulls);
  
-       tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-       result = HeapTupleGetDatum(tuple);
-       SRF_RETURN_NEXT(funcctx, result);
+           /* switch to appropriate context while storing the tuple */
+           MemoryContextSwitchTo(per_query_ctx);
+           tuplestore_puttuple(tupstore, tuple);
+       }
     }
  
-   SRF_RETURN_DONE(funcctx);
+   /* clean up and return the tuplestore */
+   tuplestore_donestoring(tupstore);
+
+   MemoryContextSwitchTo(oldcontext);
+
+   rsinfo->returnMode = SFRM_Materialize;
+   rsinfo->setResult = tupstore;
+   rsinfo->setDesc = tupdesc;
+
+   return (Datum) 0;
  }
  
  /*
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c

index bb0508d6bc243f07d94b41756db3aa48ab2999e0..a66f51c26a62607195bce70cdb64d2c262a50839 100644 (file)
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.87 2007/02/27 01:11:25 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.88 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -569,7 +569,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
     TupleHashIterator hashiter;
     TupleHashEntry entry;
  
-   ResetTupleHashIterator(hashtable, &hashiter);
+   InitTupleHashIterator(hashtable, &hashiter);
     while ((entry = ScanTupleHashTable(&hashiter)) != NULL)
     {
         ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
@@ -577,8 +577,12 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
                                numCols, keyColIdx,
                                hashtable->cur_eq_funcs,
                                hashtable->tempcxt))
+       {
+           TermTupleHashIterator(&hashiter);
             return true;
+       }
     }
+   /* No TermTupleHashIterator call needed here */
     return false;
  }
  
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c

index 2be504213f42d5e49eeba99c8ec2214c508cef89..2f3e00d6a2622f1cce3eb7ead4c876013efb873a 100644 (file)
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -23,7 +23,7 @@
   * Copyright (c) 2003-2007, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.11 2007/01/05 22:19:30 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.12 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -907,7 +907,11 @@ tbm_lossify(TIDBitmap *tbm)
         tbm_mark_page_lossy(tbm, page->blockno);
  
         if (tbm->nentries <= tbm->maxentries)
-           return;             /* we have done enough */
+       {
+           /* we have done enough */
+           hash_seq_term(&status);
+           break;
+       }
  
         /*
          * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c

index f781ca6c3d1a72de4c875fd94824e49629bb7c65..2ced795798abe659252daaad7f20b2092dd313e9 100644 (file)
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -26,7 +26,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.74 2007/01/05 22:19:43 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.75 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -63,6 +63,7 @@
  
  #include "postgres.h"
  
+#include "access/xact.h"
  #include "storage/shmem.h"
  #include "storage/spin.h"
  #include "utils/dynahash.h"
@@ -160,6 +161,9 @@ struct HTAB
     char       *tabname;        /* table name (for error messages) */
     bool        isshared;       /* true if table is in shared memory */
  
+   /* freezing a shared table isn't allowed, so we can keep state here */
+   bool        frozen;         /* true = no more inserts allowed */
+
     /* We keep local copies of these fixed values to reduce contention */
     Size        keysize;        /* hash key length in bytes */
     long        ssize;          /* segment size --- must be power of 2 */
@@ -195,6 +199,9 @@ static void hdefault(HTAB *hashp);
  static int choose_nelem_alloc(Size entrysize);
  static bool init_htab(HTAB *hashp, long nelem);
  static void hash_corrupted(HTAB *hashp);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
  
  
  /*
@@ -356,6 +363,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                      errmsg("out of memory")));
     }
  
+   hashp->frozen = false;
+
     hdefault(hashp);
  
     hctl = hashp->hctl;
@@ -898,6 +907,10 @@ hash_search_with_hash_value(HTAB *hashp,
             if (currBucket != NULL)
                 return (void *) ELEMENTKEY(currBucket);
  
+           /* disallow inserts if frozen */
+           if (hashp->frozen)
+               elog(ERROR, "cannot insert into a frozen hashtable");
+
             currBucket = get_hash_entry(hashp);
             if (currBucket == NULL)
             {
@@ -925,10 +938,15 @@ hash_search_with_hash_value(HTAB *hashp,
  
             /* caller is expected to fill the data field on return */
  
-           /* Check if it is time to split a bucket */
-           /* Can't split if running in partitioned mode */
+           /*
+            * Check if it is time to split a bucket.  Can't split if running
+            * in partitioned mode, nor if table is the subject of any active
+            * hash_seq_search scans.  Strange order of these tests is to try
+            * to check cheaper conditions first.
+            */
             if (!IS_PARTITIONED(hctl) &&
-            hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+               hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+               !has_seq_scans(hashp))
             {
                 /*
                  * NOTE: failure to expand table is not a fatal error, it just
@@ -1001,18 +1019,30 @@ hash_get_num_entries(HTAB *hashp)
  }
  
  /*
- * hash_seq_init/_search
+ * hash_seq_init/_search/_term
   *         Sequentially search through hash table and return
   *         all the elements one by one, return NULL when no more.
   *
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
   * NOTE: caller may delete the returned element before continuing the scan.
   * However, deleting any other element while the scan is in progress is
   * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
   * if elements are added to the table while the scan is in progress, it is
   * unspecified whether they will be visited by the scan or not.
   *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
+ * wherein it is inconvenient to track whether a scan is still open, and
+ * there's no possibility of further insertions after readout has begun.
+ *
   * NOTE: to use this with a partitioned hashtable, caller had better hold
   * at least shared lock on all partitions of the table throughout the scan!
+ * We can cope with insertions or deletions by our own backend, but *not*
+ * with concurrent insertions or deletions by another.
   */
  void
  hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
@@ -1020,6 +1050,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
     status->hashp = hashp;
     status->curBucket = 0;
     status->curEntry = NULL;
+   if (!hashp->frozen)
+       register_seq_scan(hashp);
  }
  
  void *
@@ -1054,7 +1086,10 @@ hash_seq_search(HASH_SEQ_STATUS *status)
     max_bucket = hctl->max_bucket;
  
     if (curBucket > max_bucket)
+   {
+       hash_seq_term(status);
         return NULL;            /* search is done */
+   }
  
     /*
      * first find the right segment in the table directory.
@@ -1076,6 +1111,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
         if (++curBucket > max_bucket)
         {
             status->curBucket = curBucket;
+           hash_seq_term(status);
             return NULL;        /* search is done */
         }
         if (++segment_ndx >= ssize)
@@ -1094,6 +1130,36 @@ hash_seq_search(HASH_SEQ_STATUS *status)
     return (void *) ELEMENTKEY(curElem);
  }
  
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+   if (!status->hashp->frozen)
+       deregister_seq_scan(status->hashp);
+}
+
+/*
+ * hash_freeze
+ *         Freeze a hashtable against future insertions (deletions are
+ *         still allowed)
+ *
+ * The reason for doing this is that by preventing any more bucket splits,
+ * we no longer need to worry about registering hash_seq_search scans,
+ * and thus caller need not be careful about ensuring hash_seq_term gets
+ * called at the right times.
+ *
+ * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
+ * with active scans (since hash_seq_term would then do the wrong thing).
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+   if (hashp->isshared)
+       elog(ERROR, "cannot freeze shared hashtable");
+   if (!hashp->frozen && has_seq_scans(hashp))
+       elog(ERROR, "cannot freeze hashtable with active scans");
+   hashp->frozen = true;
+}
+
  
  /********************************* UTILITIES ************************/
  
@@ -1324,3 +1390,136 @@ my_log2(long num)
         ;
     return i;
  }
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here.  The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits).  Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into.  This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow.  Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both.  It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us.  The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address.  We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS];   /* tables being scanned */
+static int seq_scan_level[MAX_SEQ_SCANS];      /* subtransaction nest level */
+static int num_seq_scans = 0;
+
+
+/* Register a table as having an active hash_seq_search scan */
+static void
+register_seq_scan(HTAB *hashp)
+{
+   if (num_seq_scans >= MAX_SEQ_SCANS)
+       elog(ERROR, "too many active hash_seq_search scans");
+   seq_scan_tables[num_seq_scans] = hashp;
+   seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
+   num_seq_scans++;
+}
+
+/* Deregister an active scan */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+   int     i;
+
+   /* Search backward since it's most likely at the stack top */
+   for (i = num_seq_scans - 1; i >= 0; i--)
+   {
+       if (seq_scan_tables[i] == hashp)
+       {
+           seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+           seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+           num_seq_scans--;
+           return;
+       }
+   }
+   elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+        hashp->tabname);
+}
+
+/* Check if a table has any active scan */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+   int     i;
+
+   for (i = 0; i < num_seq_scans; i++)
+   {
+       if (seq_scan_tables[i] == hashp)
+           return true;
+   }
+   return false;
+}
+
+/* Clean up any open scans at end of transaction */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+   /*
+    * During abort cleanup, open scans are expected; just silently clean 'em
+    * out.  An open scan at commit means someone forgot a hash_seq_term()
+    * call, so complain.
+    *
+    * Note: it's tempting to try to print the tabname here, but refrain for
+    * fear of touching deallocated memory.  This isn't a user-facing message
+    * anyway, so it needn't be pretty.
+    */
+   if (isCommit)
+   {
+       int     i;
+
+       for (i = 0; i < num_seq_scans; i++)
+       {
+           elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                seq_scan_tables[i]);
+       }
+   }
+   num_seq_scans = 0;
+}
+
+/* Clean up any open scans at end of subtransaction */
+void
+AtEOSubXact_HashTables(bool isCommit, int nestDepth)
+{
+   int     i;
+
+   /*
+    * Search backward to make cleanup easy.  Note we must check all entries,
+    * not only those at the end of the array, because deletion technique
+    * doesn't keep them in order.
+    */
+   for (i = num_seq_scans - 1; i >= 0; i--)
+   {
+       if (seq_scan_level[i] >= nestDepth)
+       {
+           if (isCommit)
+               elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                    seq_scan_tables[i]);
+           seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+           seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+           num_seq_scans--;
+       }
+   }
+}
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c

index 69bb3e3ebdb54be080940f1c63868646ed3fe8f8..412e41952c21b1fbac9621a55db6ae7a12f8ef5e 100644 (file)
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -12,7 +12,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.102 2007/04/26 16:13:13 neilc Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.103 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -22,7 +22,6 @@
  #include "access/xact.h"
  #include "catalog/pg_type.h"
  #include "commands/portalcmds.h"
-#include "funcapi.h"
  #include "miscadmin.h"
  #include "utils/builtins.h"
  #include "utils/memutils.h"
@@ -621,7 +620,9 @@ AtCommit_Portals(void)
         /* Zap all non-holdable portals */
         PortalDrop(portal, true);
  
-       /* Restart the iteration */
+       /* Restart the iteration in case that led to other drops */
+       /* XXX is this really necessary? */
+       hash_seq_term(&status);
         hash_seq_init(&status, PortalHashTable);
     }
  }
@@ -858,79 +859,68 @@ AtSubCleanup_Portals(SubTransactionId mySubid)
  Datum
  pg_cursor(PG_FUNCTION_ARGS)
  {
-   FuncCallContext *funcctx;
-   HASH_SEQ_STATUS *hash_seq;
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   TupleDesc   tupdesc;
+   Tuplestorestate *tupstore;
+   MemoryContext per_query_ctx;
+   MemoryContext oldcontext;
+   HASH_SEQ_STATUS hash_seq;
     PortalHashEnt *hentry;
  
-   /* stuff done only on the first call of the function */
-   if (SRF_IS_FIRSTCALL())
-   {
-       MemoryContext oldcontext;
-       TupleDesc   tupdesc;
-
-       /* create a function context for cross-call persistence */
-       funcctx = SRF_FIRSTCALL_INIT();
-
-       /*
-        * switch to memory context appropriate for multiple function calls
-        */
-       oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
-
-       if (PortalHashTable)
-       {
-           hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-           hash_seq_init(hash_seq, PortalHashTable);
-           funcctx->user_fctx = (void *) hash_seq;
-       }
-       else
-           funcctx->user_fctx = NULL;
-
-       /*
-        * build tupdesc for result tuples. This must match the definition of
-        * the pg_cursors view in system_views.sql
-        */
-       tupdesc = CreateTemplateTupleDesc(6, false);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                          TEXTOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                          TEXTOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
-                          BOOLOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
-                          BOOLOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
-                          BOOLOID, -1, 0);
-       TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
-                          TIMESTAMPTZOID, -1, 0);
-
-       funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-       MemoryContextSwitchTo(oldcontext);
-   }
-
-   /* stuff done on every call of the function */
-   funcctx = SRF_PERCALL_SETUP();
-   hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+   /* check to see if caller supports us returning a tuplestore */
+   if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("set-valued function called in context that cannot accept a set")));
+   if (!(rsinfo->allowedModes & SFRM_Materialize))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("materialize mode required, but it is not " \
+                       "allowed in this context")));
+
+   /* need to build tuplestore in query context */
+   per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+   oldcontext = MemoryContextSwitchTo(per_query_ctx);
  
-   /* if the hash table is uninitialized, we're done */
-   if (hash_seq == NULL)
-       SRF_RETURN_DONE(funcctx);
+   /*
+    * build tupdesc for result tuples. This must match the definition of
+    * the pg_cursors view in system_views.sql
+    */
+   tupdesc = CreateTemplateTupleDesc(6, false);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                      TEXTOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                      TEXTOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
+                      BOOLOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
+                      BOOLOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
+                      BOOLOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
+                      TIMESTAMPTZOID, -1, 0);
  
-   /* loop until we find a visible portal or hit the end of the list */
-   while ((hentry = hash_seq_search(hash_seq)) != NULL)
-   {
-       if (hentry->portal->visible)
-           break;
-   }
+   /*
+    * We put all the tuples into a tuplestore in one scan of the hashtable.
+    * This avoids any issue of the hashtable possibly changing between calls.
+    */
+   tupstore = tuplestore_begin_heap(true, false, work_mem);
  
-   if (hentry)
+   hash_seq_init(&hash_seq, PortalHashTable);
+   while ((hentry = hash_seq_search(&hash_seq)) != NULL)
     {
-       Portal      portal;
-       Datum       result;
+       Portal      portal = hentry->portal;
         HeapTuple   tuple;
         Datum       values[6];
         bool        nulls[6];
  
-       portal = hentry->portal;
+       /* report only "visible" entries */
+       if (!portal->visible)
+           continue;
+
+       /* generate junk in short-term context */
+       MemoryContextSwitchTo(oldcontext);
+
         MemSet(nulls, 0, sizeof(nulls));
  
         values[0] = DirectFunctionCall1(textin, CStringGetDatum(portal->name));
@@ -944,10 +934,21 @@ pg_cursor(PG_FUNCTION_ARGS)
         values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL);
         values[5] = TimestampTzGetDatum(portal->creation_time);
  
-       tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-       result = HeapTupleGetDatum(tuple);
-       SRF_RETURN_NEXT(funcctx, result);
+       tuple = heap_form_tuple(tupdesc, values, nulls);
+
+       /* switch to appropriate context while storing the tuple */
+       MemoryContextSwitchTo(per_query_ctx);
+       tuplestore_puttuple(tupstore, tuple);
     }
  
-   SRF_RETURN_DONE(funcctx);
+   /* clean up and return the tuplestore */
+   tuplestore_donestoring(tupstore);
+
+   MemoryContextSwitchTo(oldcontext);
+
+   rsinfo->returnMode = SFRM_Materialize;
+   rsinfo->setResult = tupstore;
+   rsinfo->setDesc = tupdesc;
+
+   return (Datum) 0;
  }
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index 56bac9350f5205e7a496e8eafd356e0d9e47a150..726ee5bdae6845b1ab28e876f32ce6669d4336a3 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.171 2007/03/27 23:21:12 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.172 2007/04/26 23:24:44 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -408,8 +408,20 @@ typedef struct TupleHashTableData
  
  typedef HASH_SEQ_STATUS TupleHashIterator;
  
-#define ResetTupleHashIterator(htable, iter) \
+/*
+ * Use InitTupleHashIterator/TermTupleHashIterator for a read/write scan.
+ * Use ResetTupleHashIterator if the table can be frozen (in this case no
+ * explicit scan termination is needed).
+ */
+#define InitTupleHashIterator(htable, iter) \
     hash_seq_init(iter, (htable)->hashtab)
+#define TermTupleHashIterator(iter) \
+   hash_seq_term(iter)
+#define ResetTupleHashIterator(htable, iter) \
+   do { \
+       hash_freeze((htable)->hashtab); \
+       hash_seq_init(iter, (htable)->hashtab); \
+   } while (0)
  #define ScanTupleHashTable(iter) \
     ((TupleHashEntry) hash_seq_search(iter))
  
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h

index d40ec4f077f7a2be2549b10634e29dfe9c6ec055..47f2dbc9b3f347caa30b7ccba380fdb7598fbaa0 100644 (file)
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.46 2007/01/05 22:19:59 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.47 2007/04/26 23:24:46 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -130,9 +130,13 @@ extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
  extern long hash_get_num_entries(HTAB *hashp);
  extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
  extern void *hash_seq_search(HASH_SEQ_STATUS *status);
+extern void hash_seq_term(HASH_SEQ_STATUS *status);
+extern void hash_freeze(HTAB *hashp);
  extern Size hash_estimate_size(long num_entries, Size entrysize);
  extern long hash_select_dirsize(long num_entries);
  extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern void AtEOXact_HashTables(bool isCommit);
+extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
  
  /*
   * prototypes for functions in hashfn.c
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 26 Apr 2007 23:24:46 +0000 (23:24 +0000)
src/backend/access/transam/xact.c		patch \| blob \| blame \| history
src/backend/commands/prepare.c		patch \| blob \| blame \| history
src/backend/executor/nodeSubplan.c		patch \| blob \| blame \| history
src/backend/nodes/tidbitmap.c		patch \| blob \| blame \| history
src/backend/utils/hash/dynahash.c		patch \| blob \| blame \| history
src/backend/utils/mmgr/portalmem.c		patch \| blob \| blame \| history
src/include/nodes/execnodes.h		patch \| blob \| blame \| history
src/include/utils/hsearch.h		patch \| blob \| blame \| history