diff options
-rw-r--r-- | src/backend/access/transam/xact.c | 6 | ||||
-rw-r--r-- | src/backend/executor/nodeSubplan.c | 8 | ||||
-rw-r--r-- | src/backend/storage/smgr/md.c | 3 | ||||
-rw-r--r-- | src/backend/utils/hash/dynahash.c | 198 | ||||
-rw-r--r-- | src/include/nodes/execnodes.h | 16 | ||||
-rw-r--r-- | src/include/utils/hsearch.h | 8 |
6 files changed, 228 insertions, 11 deletions
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index dd5c4ea64bd..2e5c27635ae 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.195.4.1 2005/04/11 19:51:31 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.195.4.2 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1579,6 +1579,7 @@ CommitTransaction(void) AtEOXact_Namespace(true); /* smgrcommit already done */ AtEOXact_Files(); + AtEOXact_HashTables(true); pgstat_count_xact_commit(); CurrentResourceOwner = NULL; @@ -1720,6 +1721,7 @@ AbortTransaction(void) AtEOXact_Namespace(false); smgrabort(); AtEOXact_Files(); + AtEOXact_HashTables(false); pgstat_count_xact_rollback(); /* @@ -3330,6 +3332,7 @@ CommitSubTransaction(void) s->parent->subTransactionId); AtEOSubXact_Files(true, s->subTransactionId, s->parent->subTransactionId); + AtEOSubXact_HashTables(true, s->nestingLevel); /* * We need to restore the upper transaction's read-only state, in case @@ -3439,6 +3442,7 @@ AbortSubTransaction(void) s->parent->subTransactionId); AtEOSubXact_Files(false, s->subTransactionId, s->parent->subTransactionId); + AtEOSubXact_HashTables(false, s->nestingLevel); } /* diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 3dcd03db072..7acd98289bf 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.66 2004/12/31 21:59:45 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeSubplan.c,v 1.66.4.1 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -631,7 +631,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot) TupleHashIterator hashiter; TupleHashEntry entry; - ResetTupleHashIterator(hashtable, &hashiter); + InitTupleHashIterator(hashtable, &hashiter); while ((entry = ScanTupleHashTable(&hashiter)) != NULL) { if (!execTuplesUnequal(entry->firstTuple, @@ -640,8 +640,12 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot) numCols, keyColIdx, hashtable->eqfunctions, hashtable->tempcxt)) + { + TermTupleHashIterator(&hashiter); return true; + } } + /* No TermTupleHashIterator call needed here */ return false; } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 744601f1150..5fd6740ddb3 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114.4.1 2006/11/20 01:08:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114.4.2 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -792,6 +792,7 @@ mdsync(void) entry->rnode.spcNode, entry->rnode.dbNode, entry->rnode.relNode))); + hash_seq_term(&hstat); return false; } } diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1a54005d2c8..2d5a6b571d4 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.58.4.1 2005/06/18 20:51:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.58.4.2 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -44,6 +44,7 @@ #include "postgres.h" +#include "access/xact.h" #include "utils/dynahash.h" #include "utils/hsearch.h" #include "utils/memutils.h" @@ -69,6 +70,9 @@ static bool expand_table(HTAB *hashp); static void hdefault(HTAB *hashp); static bool init_htab(HTAB *hashp, long nelem); static void hash_corrupted(HTAB *hashp); +static void register_seq_scan(HTAB *hashp); +static void deregister_seq_scan(HTAB *hashp); +static bool has_seq_scans(HTAB *hashp); /* @@ -186,6 +190,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) errmsg("out of memory"))); } + hashp->frozen = false; + hdefault(hashp); hctl = hashp->hctl; @@ -646,6 +652,10 @@ hash_search(HTAB *hashp, if (currBucket != NULL) return (void *) ELEMENTKEY(currBucket); + /* disallow inserts if frozen */ + if (hashp->frozen) + elog(ERROR, "cannot insert into a frozen hashtable"); + /* get the next free element */ currBucket = hctl->freeList; if (currBucket == NULL) @@ -669,8 +679,12 @@ hash_search(HTAB *hashp, /* caller is expected to fill the data field on return */ - /* Check if it is time to split the segment */ - if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor) + /* + * Check if it is time to split a bucket. Can't split if table + * is the subject of any active hash_seq_search scans. + */ + if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && + !has_seq_scans(hashp)) { /* * NOTE: failure to expand table is not a fatal error, it @@ -689,15 +703,25 @@ hash_search(HTAB *hashp, } /* - * hash_seq_init/_search + * hash_seq_init/_search/_term * Sequentially search through hash table and return * all the elements one by one, return NULL when no more. * + * hash_seq_term should be called if and only if the scan is abandoned before + * completion; if hash_seq_search returns NULL then it has already done the + * end-of-scan cleanup. + * * NOTE: caller may delete the returned element before continuing the scan. * However, deleting any other element while the scan is in progress is * UNDEFINED (it might be the one that curIndex is pointing at!). Also, * if elements are added to the table while the scan is in progress, it is * unspecified whether they will be visited by the scan or not. + * + * NOTE: it is possible to use hash_seq_init/hash_seq_search without any + * worry about hash_seq_term cleanup, if the hashtable is first locked against + * further insertions by calling hash_freeze. This is used by nodeAgg.c, + * wherein it is inconvenient to track whether a scan is still open, and + * there's no possibility of further insertions after readout has begun. */ void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) @@ -705,6 +729,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) status->hashp = hashp; status->curBucket = 0; status->curEntry = NULL; + if (!hashp->frozen) + register_seq_scan(hashp); } void * @@ -758,9 +784,40 @@ hash_seq_search(HASH_SEQ_STATUS *status) ++status->curBucket; } + hash_seq_term(status); return NULL; /* out of buckets */ } +void +hash_seq_term(HASH_SEQ_STATUS *status) +{ + if (!status->hashp->frozen) + deregister_seq_scan(status->hashp); +} + +/* + * hash_freeze + * Freeze a hashtable against future insertions (deletions are + * still allowed) + * + * The reason for doing this is that by preventing any more bucket splits, + * we no longer need to worry about registering hash_seq_search scans, + * and thus caller need not be careful about ensuring hash_seq_term gets + * called at the right times. + * + * Multiple calls to hash_freeze() are allowed, but you can't freeze a table + * with active scans (since hash_seq_term would then do the wrong thing). + */ +void +hash_freeze(HTAB *hashp) +{ + if (hashp->isshared) + elog(ERROR, "cannot freeze shared hashtable"); + if (!hashp->frozen && has_seq_scans(hashp)) + elog(ERROR, "cannot freeze hashtable with active scans"); + hashp->frozen = true; +} + /********************************* UTILITIES ************************/ @@ -971,3 +1028,136 @@ my_log2(long num) ; return i; } + + +/************************* SEQ SCAN TRACKING ************************/ + +/* + * We track active hash_seq_search scans here. The need for this mechanism + * comes from the fact that a scan will get confused if a bucket split occurs + * while it's in progress: it might visit entries twice, or even miss some + * entirely (if it's partway through the same bucket that splits). Hence + * we want to inhibit bucket splits if there are any active scans on the + * table being inserted into. This is a fairly rare case in current usage, + * so just postponing the split until the next insertion seems sufficient. + * + * Given present usages of the function, only a few scans are likely to be + * open concurrently; so a finite-size stack of open scans seems sufficient, + * and we don't worry that linear search is too slow. Note that we do + * allow multiple scans of the same hashtable to be open concurrently. + * + * This mechanism can support concurrent scan and insertion in a shared + * hashtable if it's the same backend doing both. It would fail otherwise, + * but locking reasons seem to preclude any such scenario anyway, so we don't + * worry. + * + * This arrangement is reasonably robust if a transient hashtable is deleted + * without notifying us. The absolute worst case is we might inhibit splits + * in another table created later at exactly the same address. We will give + * a warning at transaction end for reference leaks, so any bugs leading to + * lack of notification should be easy to catch. + */ + +#define MAX_SEQ_SCANS 100 + +static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */ +static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */ +static int num_seq_scans = 0; + + +/* Register a table as having an active hash_seq_search scan */ +static void +register_seq_scan(HTAB *hashp) +{ + if (num_seq_scans >= MAX_SEQ_SCANS) + elog(ERROR, "too many active hash_seq_search scans"); + seq_scan_tables[num_seq_scans] = hashp; + seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); + num_seq_scans++; +} + +/* Deregister an active scan */ +static void +deregister_seq_scan(HTAB *hashp) +{ + int i; + + /* Search backward since it's most likely at the stack top */ + for (i = num_seq_scans - 1; i >= 0; i--) + { + if (seq_scan_tables[i] == hashp) + { + seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; + seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; + num_seq_scans--; + return; + } + } + elog(ERROR, "no hash_seq_search scan for hash table \"%s\"", + hashp->tabname); +} + +/* Check if a table has any active scan */ +static bool +has_seq_scans(HTAB *hashp) +{ + int i; + + for (i = 0; i < num_seq_scans; i++) + { + if (seq_scan_tables[i] == hashp) + return true; + } + return false; +} + +/* Clean up any open scans at end of transaction */ +void +AtEOXact_HashTables(bool isCommit) +{ + /* + * During abort cleanup, open scans are expected; just silently clean 'em + * out. An open scan at commit means someone forgot a hash_seq_term() + * call, so complain. + * + * Note: it's tempting to try to print the tabname here, but refrain for + * fear of touching deallocated memory. This isn't a user-facing message + * anyway, so it needn't be pretty. + */ + if (isCommit) + { + int i; + + for (i = 0; i < num_seq_scans; i++) + { + elog(WARNING, "leaked hash_seq_search scan for hash table %p", + seq_scan_tables[i]); + } + } + num_seq_scans = 0; +} + +/* Clean up any open scans at end of subtransaction */ +void +AtEOSubXact_HashTables(bool isCommit, int nestDepth) +{ + int i; + + /* + * Search backward to make cleanup easy. Note we must check all entries, + * not only those at the end of the array, because deletion technique + * doesn't keep them in order. + */ + for (i = num_seq_scans - 1; i >= 0; i--) + { + if (seq_scan_level[i] >= nestDepth) + { + if (isCommit) + elog(WARNING, "leaked hash_seq_search scan for hash table %p", + seq_scan_tables[i]); + seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; + seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; + num_seq_scans--; + } + } +} diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index a9bfd5c5d9b..3598ba8de75 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.122 2004/12/31 22:03:34 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.122.4.1 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -360,8 +360,20 @@ typedef struct TupleHashTableData typedef HASH_SEQ_STATUS TupleHashIterator; -#define ResetTupleHashIterator(htable, iter) \ +/* + * Use InitTupleHashIterator/TermTupleHashIterator for a read/write scan. + * Use ResetTupleHashIterator if the table can be frozen (in this case no + * explicit scan termination is needed). + */ +#define InitTupleHashIterator(htable, iter) \ hash_seq_init(iter, (htable)->hashtab) +#define TermTupleHashIterator(iter) \ + hash_seq_term(iter) +#define ResetTupleHashIterator(htable, iter) \ + do { \ + hash_freeze((htable)->hashtab); \ + hash_seq_init(iter, (htable)->hashtab); \ + } while (0) #define ScanTupleHashTable(iter) \ ((TupleHashEntry) hash_seq_search(iter)) diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 5a012309ea1..db5bf4e3798 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.34.4.1 2005/06/18 20:51:44 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.34.4.2 2007/04/26 23:25:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -119,6 +119,8 @@ typedef struct HTAB * used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ + /* freezing a shared table isn't allowed, so we can keep state here */ + bool frozen; /* true = no more inserts allowed */ HashCopyFunc keycopy; /* key copying function */ } HTAB; @@ -188,8 +190,12 @@ extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void *hash_seq_search(HASH_SEQ_STATUS *status); +extern void hash_seq_term(HASH_SEQ_STATUS *status); +extern void hash_freeze(HTAB *hashp); extern long hash_estimate_size(long num_entries, Size entrysize); extern long hash_select_dirsize(long num_entries); +extern void AtEOXact_HashTables(bool isCommit); +extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth); /* * prototypes for functions in hashfn.c |