Final stage of psort reconstruction work: replace psort.c with tuplesort.c

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index 11f527fc3bad21b3b632e6a9efddc999a9ea5fa8..d8d835f424bf9d88fdfefd55ab946ec1ed9525e1 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1,17 +1,17 @@
  /*-------------------------------------------------------------------------
   *
- * btree.c
+ * nbtree.c
   *   Implementation of Lehman and Yao's btree management algorithm for
   *   Postgres.
   *
- * Copyright (c) 1994, Regents of the University of California
+ * NOTES
+ *   This file contains only the public interface routines.
   *
   *
- * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.46 1999/09/18 19:06:10 tgl Exp $
+ * Copyright (c) 1994, Regents of the University of California
   *
- * NOTES
- *   This file contains only the public interface routines.
+ * IDENTIFICATION
+ *   $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.47 1999/10/17 22:15:03 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -76,7 +76,7 @@ btbuild(Relation heap,
  #endif
     Node       *pred,
                *oldPred;
-   void       *spool = (void *) NULL;
+   BTSpool    *spool = NULL;
     bool        isunique;
     bool        usefast;
  
@@ -147,7 +147,7 @@ btbuild(Relation heap,
  
     if (usefast)
     {
-       spool = _bt_spoolinit(index, 7, isunique);
+       spool = _bt_spoolinit(index, isunique);
         res = (InsertIndexResult) NULL;
     }
  
@@ -249,11 +249,11 @@ btbuild(Relation heap,
  
         /*
          * if we are doing bottom-up btree build, we insert the index into
-        * a spool page for subsequent processing.  otherwise, we insert
+        * a spool file for subsequent processing.  otherwise, we insert
          * into the btree.
          */
         if (usefast)
-           _bt_spool(index, btitem, spool);
+           _bt_spool(btitem, spool);
         else
             res = _bt_doinsert(index, btitem, isunique, heap);
  
@@ -275,15 +275,13 @@ btbuild(Relation heap,
     }
  
     /*
-    * if we are doing bottom-up btree build, we now have a bunch of
-    * sorted runs in the spool pages.  finish the build by (1) merging
-    * the runs, (2) inserting the sorted tuples into btree pages and (3)
-    * building the upper levels.
+    * if we are doing bottom-up btree build, finish the build by
+    * (1) completing the sort of the spool file, (2) inserting the
+    * sorted tuples into btree pages and (3) building the upper levels.
      */
     if (usefast)
     {
-       _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
-       _bt_leafbuild(index, spool);
+       _bt_leafbuild(spool);
         _bt_spooldestroy(spool);
     }
  
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c

index c1564544b0318f35164e24cf1437b0d1a6f9232b..48386c113f05113e07ac90edec3d760f25478d43 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1,68 +1,47 @@
  /*-------------------------------------------------------------------------
- * btsort.c
- *
- * Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- *   $Id: nbtsort.c,v 1.46 1999/07/19 07:07:19 momjian Exp $
+ * nbtsort.c
+ *     Build a btree from sorted input by loading leaf pages sequentially.
   *
   * NOTES
   *
- * what we do is:
- * - generate a set of initial one-block runs, distributed round-robin
- *  between the output tapes.
- * - for each pass,
- *  - swap input and output tape sets, rewinding both and truncating
- *    the output tapes.
- *  - merge the current run in each input tape to the current output
- *    tape.
- *    - when each input run has been exhausted, switch to another output
- *      tape and start processing another run.
- * - when we have fewer runs than tapes, we know we are ready to start
- *  merging into the btree leaf pages.  (i.e., we do not have to wait
- *  until we have exactly one tape.)
- * - as we extract tuples from the final runs, we build the pages for
- *  each level.  when we have only one page on a level, it must be the
- *  root -- it can be attached to the btree metapage and we are done.
- *
- * conventions:
- * - external interface routines take in and return "void *" for their
- *  opaque handles.  this is for modularity reasons.
+ * We use tuplesort.c to sort the given index tuples into order.
+ * Then we scan the index tuples in order and build the btree pages
+ * for each level.  When we have only one page on a level, it must be the
+ * root -- it can be attached to the btree metapage and we are done.
   *
   * this code is moderately slow (~10% slower) compared to the regular
   * btree (insertion) build code on sorted or well-clustered data.  on
   * random data, however, the insertion build code is unusable -- the
   * difference on a 60MB heap is a factor of 15 because the random
- * probes into the btree thrash the buffer pool.
+ * probes into the btree thrash the buffer pool.  (NOTE: the above
+ * "10%" estimate is probably obsolete, since it refers to an old and
+ * not very good external sort implementation that used to exist in
+ * this module.  tuplesort.c is almost certainly faster.)
   *
   * this code currently packs the pages to 100% of capacity.  this is
   * not wise, since *any* insertion will cause splitting.  filling to
   * something like the standard 70% steady-state load factor for btrees
   * would probably be better.
   *
- * somebody desperately needs to figure out how to do a better job of
- * balancing the merge passes -- the fan-in on the final merges can be
- * pretty poor, which is bad for performance.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *   $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.47 1999/10/17 22:15:04 tgl Exp $
+ *
   *-------------------------------------------------------------------------
   */
  
-#include <fcntl.h>
-
  #include "postgres.h"
  
  #include "access/nbtree.h"
+#include "utils/tuplesort.h"
  
  
  #ifdef BTREE_BUILD_STATS
  #define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]
  #endif
  
-static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
-static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
-static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
-static void _bt_uppershutdown(Relation index, BTPageState *state);
-
  /*
   * turn on debugging output.
   *
@@ -70,689 +49,108 @@ static void _bt_uppershutdown(Relation index, BTPageState *state);
   * only really useful for integer keys.
   */
  /*#define FASTBUILD_DEBUG*/
-#define FASTBUILD_SPOOL
-#define FASTBUILD_MERGE
-
-#define MAXTAPES       (7)
-#define TAPEBLCKSZ     (BLCKSZ << 2)
-
-extern int NDirectFileRead;
-extern int NDirectFileWrite;
-
-/*
- * this is what we use to shovel BTItems in and out of memory. it's
- * bigger than a standard block because we are doing a lot of strictly
- * sequential i/o. this is obviously something of a tradeoff since we
- * are potentially reading a bunch of zeroes off of disk in many
- * cases.
- *
- * BTItems are packed in and MAXALIGN'd.
- *
- * the fd should not be going out to disk, strictly speaking, but it's
- * the only thing like that so i'm not going to worry about wasting a
- * few bytes.
- */
-typedef struct
-{
-   int         bttb_magic;     /* magic number */
-   File        bttb_fd;        /* file descriptor */
-   int         bttb_top;       /* top of free space within bttb_data */
-   short       bttb_ntup;      /* number of tuples in this block */
-   short       bttb_eor;       /* End-Of-Run marker */
-   char        bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
-} BTTapeBlock;
  
  /*
- * this structure holds the bookkeeping for a simple balanced multiway
- * merge.  (polyphase merging is hairier than i want to get into right
- * now, and i don't see why i have to care how many "tapes" i use
- * right now.  though if psort was in a condition that i could hack it
- * to do this, you bet i would.)
+ * Status record for spooling.
   */
-typedef struct
+struct BTSpool
  {
-   int         bts_ntapes;
-   int         bts_tape;
-   BTTapeBlock **bts_itape;    /* input tape blocks */
-   BTTapeBlock **bts_otape;    /* output tape blocks */
+   Tuplesortstate *sortstate;  /* state data for tuplesort.c */
+   Relation    index;
     bool        isunique;
-} BTSpool;
-
-/*-------------------------------------------------------------------------
- * sorting comparison routine - returns {-1,0,1} depending on whether
- * the key in the left BTItem is {<,=,>} the key in the right BTItem.
- *
- * we want to use _bt_isortcmp as a comparison function for qsort(3),
- * but it needs extra arguments, so we "pass them in" as global
- * variables.  ick.  fortunately, they are the same throughout the
- * build, so we need do this only once.  this is why you must call
- * _bt_isortcmpinit before the call to qsort(3).
- *
- * a NULL BTItem is always assumed to be greater than any actual
- * value; our heap routines (see below) assume that the smallest
- * element in the heap is returned.  that way, NULL values from the
- * exhausted tapes can sift down to the bottom of the heap.  in point
- * of fact we just don't replace the elements of exhausted tapes, but
- * what the heck.
- * *-------------------------------------------------------------------------
- */
-typedef struct
-{
-   Datum      *btsk_datum;
-   char       *btsk_nulls;
-   BTItem      btsk_item;
-} BTSortKey;
-
-static Relation _bt_sortrel;
-static int _bt_nattr;
-static BTSpool *_bt_inspool;
-
-static void
-_bt_isortcmpinit(Relation index, BTSpool *spool)
-{
-   _bt_sortrel = index;
-   _bt_inspool = spool;
-   _bt_nattr = index->rd_att->natts;
-}
-
-static int
-_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
-{
-   Datum      *k1_datum = k1->btsk_datum;
-   Datum      *k2_datum = k2->btsk_datum;
-   char       *k1_nulls = k1->btsk_nulls;
-   char       *k2_nulls = k2->btsk_nulls;
-   bool        equal_isnull = false;
-   int         i;
-
-   if (k1->btsk_item == (BTItem) NULL)
-   {
-       if (k2->btsk_item == (BTItem) NULL)
-           return 0;           /* 1 = 2 */
-       return 1;               /* 1 > 2 */
-   }
-   else if (k2->btsk_item == (BTItem) NULL)
-       return -1;              /* 1 < 2 */
-
-   for (i = 0; i < _bt_nattr; i++)
-   {
-       if (k1_nulls[i] != ' ') /* k1 attr is NULL */
-       {
-           if (k2_nulls[i] != ' ')     /* the same for k2 */
-           {
-               equal_isnull = true;
-               continue;
-           }
-           return 1;           /* NULL ">" NOT_NULL */
-       }
-       else if (k2_nulls[i] != ' ')    /* k2 attr is NULL */
-           return -1;          /* NOT_NULL "<" NULL */
-
-       if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
-                           k1_datum[i], k2_datum[i]))
-           return 1;           /* 1 > 2 */
-       else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
-                                k2_datum[i], k1_datum[i]))
-           return -1;          /* 1 < 2 */
-   }
-
-   if (_bt_inspool->isunique && !equal_isnull)
-   {
-       _bt_spooldestroy((void *) _bt_inspool);
-       elog(ERROR, "Cannot create unique index. Table contains non-unique values");
-   }
-   return 0;                   /* 1 = 2 */
-}
-
-static void
-_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
-{
-   sk->btsk_item = (BTItem) NULL;
-   sk->btsk_datum = (Datum *) NULL;
-   sk->btsk_nulls = (char *) NULL;
-
-   if (bti != (BTItem) NULL)
-   {
-       IndexTuple  it = &(bti->bti_itup);
-       TupleDesc   itdesc = index->rd_att;
-       Datum      *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));
-       char       *np = (char *) palloc(_bt_nattr * sizeof(char));
-       bool        isnull;
-       int         i;
-
-       for (i = 0; i < _bt_nattr; i++)
-       {
-           dp[i] = index_getattr(it, i + 1, itdesc, &isnull);
-           if (isnull)
-               np[i] = 'n';
-           else
-               np[i] = ' ';
-       }
-       sk->btsk_item = bti;
-       sk->btsk_datum = dp;
-       sk->btsk_nulls = np;
-   }
-}
-
-/*-------------------------------------------------------------------------
- * priority queue methods
- *
- * these were more-or-less lifted from the heap section of the 1984
- * edition of gonnet's book on algorithms and data structures.  they
- * are coded so that the smallest element in the heap is returned (we
- * use them for merging sorted runs).
- *
- * XXX these probably ought to be generic library functions.
- *-------------------------------------------------------------------------
- */
-typedef struct
-{
-   int         btpqe_tape;     /* tape identifier */
-   BTSortKey   btpqe_item;     /* pointer to BTItem in tape buffer */
-} BTPriQueueElem;
-
-#define MAXELEM MAXTAPES
-typedef struct
-{
-   int         btpq_nelem;
-   BTPriQueueElem btpq_queue[MAXELEM];
-   Relation    btpq_rel;
-} BTPriQueue;
-
-/* be sure to call _bt_isortcmpinit first */
-#define GREATER(a, b) \
-   (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
-
-static void
-_bt_pqsift(BTPriQueue *q, int parent)
-{
-   int         child;
-   BTPriQueueElem e;
-
-   for (child = parent * 2 + 1;
-        child < q->btpq_nelem;
-        child = parent * 2 + 1)
-   {
-       if (child < q->btpq_nelem - 1)
-       {
-           if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
-               ++child;
-       }
-       if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
-       {
-           e = q->btpq_queue[child];   /* struct = */
-           q->btpq_queue[child] = q->btpq_queue[parent];       /* struct = */
-           q->btpq_queue[parent] = e;  /* struct = */
-           parent = child;
-       }
-       else
-           parent = child + 1;
-   }
-}
-
-static int
-_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
-{
-   if (q->btpq_nelem < 1)
-   {                           /* already empty */
-       return -1;
-   }
-   *e = q->btpq_queue[0];      /* struct = */
-
-   if (--q->btpq_nelem < 1)
-   {                           /* now empty, don't sift */
-       return 0;
-   }
-   q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem];    /* struct = */
-   _bt_pqsift(q, 0);
-   return 0;
-}
-
-static void
-_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
-{
-   int         child,
-               parent;
-
-   if (q->btpq_nelem >= MAXELEM)
-       elog(ERROR, "_bt_pqadd: queue overflow");
-
-   child = q->btpq_nelem++;
-   while (child > 0)
-   {
-       parent = child / 2;
-       if (GREATER(e, &(q->btpq_queue[parent])))
-           break;
-       else
-       {
-           q->btpq_queue[child] = q->btpq_queue[parent];       /* struct = */
-           child = parent;
-       }
-   }
-
-   q->btpq_queue[child] = *e;  /* struct = */
-}
-
-/*-------------------------------------------------------------------------
- * tape methods
- *-------------------------------------------------------------------------
- */
+};
  
  #define BTITEMSZ(btitem) \
     ((btitem) ? \
      (IndexTupleDSize((btitem)->bti_itup) + \
       (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
      0)
-#define SPCLEFT(tape) \
-   (sizeof((tape)->bttb_data) - (tape)->bttb_top)
-#define EMPTYTAPE(tape) \
-   ((tape)->bttb_ntup <= 0)
-#define BTTAPEMAGIC        0x19660226
-
-/*
- * reset the tape header for its next use without doing anything to
- * the physical tape file. (setting bttb_top to 0 makes the block
- * empty.)
- */
-static void
-_bt_tapereset(BTTapeBlock *tape)
-{
-   tape->bttb_eor = 0;
-   tape->bttb_top = 0;
-   tape->bttb_ntup = 0;
-}
-
-/*
- * rewind the physical tape file.
- */
-static void
-_bt_taperewind(BTTapeBlock *tape)
-{
-   FileSeek(tape->bttb_fd, 0L, SEEK_SET);
-}
-
-/*
- * destroy the contents of the physical tape file without destroying
- * the tape data structure or removing the physical tape file.
- *
- * we use the VFD version of ftruncate(2) to do this rather than
- * unlinking and recreating the file.  you still have to wait while
- * the OS frees up all of the file system blocks and stuff, but at
- * least you don't have to delete and reinsert the directory entries.
- */
-static void
-_bt_tapeclear(BTTapeBlock *tape)
-{
-   /* blow away the contents of the old file */
-   _bt_taperewind(tape);
-#ifdef NOT_USED
-   FileSync(tape->bttb_fd);
-#endif
-   FileTruncate(tape->bttb_fd, 0);
-
-   /* reset the buffer */
-   _bt_tapereset(tape);
-}
-
-/*
- * create a new BTTapeBlock, allocating memory for the data structure
- * as well as opening a physical tape file.
- */
-static BTTapeBlock *
-_bt_tapecreate(void)
-{
-   BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
-
-   if (tape == (BTTapeBlock *) NULL)
-       elog(ERROR, "_bt_tapecreate: out of memory");
-
-   tape->bttb_magic = BTTAPEMAGIC;
-
-   tape->bttb_fd = OpenTemporaryFile();
-   Assert(tape->bttb_fd >= 0);
-
-   /* initialize the buffer */
-   _bt_tapereset(tape);
-
-   return tape;
-}
-
-/*
- * destroy the BTTapeBlock structure and its physical tape file.
- */
-static void
-_bt_tapedestroy(BTTapeBlock *tape)
-{
-   FileUnlink(tape->bttb_fd);
-   pfree((void *) tape);
-}
-
-/*
- * flush the tape block to the file, marking End-Of-Run if requested.
- */
-static void
-_bt_tapewrite(BTTapeBlock *tape, int eor)
-{
-   tape->bttb_eor = eor;
-   FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
-   NDirectFileWrite += TAPEBLCKSZ / BLCKSZ;
-   _bt_tapereset(tape);
-}
-
-/*
- * read a tape block from the file, overwriting the current contents
- * of the buffer.
- *
- * returns:
- * - 0 if there are no more blocks in the tape or in this run (call
- *  _bt_tapereset to clear the End-Of-Run marker)
- * - 1 if a valid block was read
- */
-static int
-_bt_taperead(BTTapeBlock *tape)
-{
-   File        fd;
-   int         nread;
-
-   if (tape->bttb_eor)
-   {
-       return 0;               /* we are already at End-Of-Run */
-   }
-
-   /*
-    * we're clobbering the old tape block, but we do need to save the VFD
-    * (the one in the block we're reading is bogus).
-    */
-   fd = tape->bttb_fd;
-   nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
-   tape->bttb_fd = fd;
  
-   if (nread != TAPEBLCKSZ)
-   {
-       Assert(nread == 0);     /* we are at EOF */
-       return 0;
-   }
-   Assert(tape->bttb_magic == BTTAPEMAGIC);
-   NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
-   return 1;
-}
  
-/*
- * get the next BTItem from a tape block.
- *
- * returns:
- * - NULL if we have run out of BTItems
- * - a pointer to the BTItemData in the block otherwise
- *
- * side effects:
- * - sets 'pos' to the current position within the block.
- */
-static BTItem
-_bt_tapenext(BTTapeBlock *tape, char **pos)
-{
-   Size        itemsz;
-   BTItem      bti;
+static void _bt_load(Relation index, BTSpool *btspool);
+static BTItem _bt_buildadd(Relation index, BTPageState *state, BTItem bti,
+                          int flags);
+static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
+static BTPageState *_bt_pagestate(Relation index, int flags,
+                                 int level, bool doupper);
+static void _bt_uppershutdown(Relation index, BTPageState *state);
  
-   if (*pos >= tape->bttb_data + tape->bttb_top)
-       return (BTItem) NULL;
-   bti = (BTItem) *pos;
-   itemsz = BTITEMSZ(bti);
-   *pos += MAXALIGN(itemsz);
-   return bti;
-}
  
  /*
- * copy a BTItem into a tape block.
- *
- * assumes that we have already checked to see if the block has enough
- * space for the item.
- *
- * side effects:
- *
- * - advances the 'top' pointer in the tape block header to point to
- * the beginning of free space.
+ * Interface routines
   */
-static void
-_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
-{
-   memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
-   ++tape->bttb_ntup;
-   tape->bttb_top += MAXALIGN(itemsz);
-}
  
-/*-------------------------------------------------------------------------
- * spool methods
- *-------------------------------------------------------------------------
- */
  
  /*
- * create and initialize a spool structure, including the underlying
- * files.
+ * create and initialize a spool structure
   */
-void *
-_bt_spoolinit(Relation index, int ntapes, bool isunique)
+BTSpool *
+_bt_spoolinit(Relation index, bool isunique)
  {
     BTSpool    *btspool = (BTSpool *) palloc(sizeof(BTSpool));
-   int         i;
  
-   if (btspool == (BTSpool *) NULL)
-       elog(ERROR, "_bt_spoolinit: out of memory");
     MemSet((char *) btspool, 0, sizeof(BTSpool));
-   btspool->bts_ntapes = ntapes;
-   btspool->bts_tape = 0;
-   btspool->isunique = isunique;
  
-   btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
-   btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
-   if (btspool->bts_itape == (BTTapeBlock **) NULL ||
-       btspool->bts_otape == (BTTapeBlock **) NULL)
-       elog(ERROR, "_bt_spoolinit: out of memory");
+   btspool->index = index;
+   btspool->isunique = isunique;
  
-   for (i = 0; i < ntapes; ++i)
-   {
-       btspool->bts_itape[i] = _bt_tapecreate();
-       btspool->bts_otape[i] = _bt_tapecreate();
-   }
+   btspool->sortstate = tuplesort_begin_index(index, isunique, false);
  
-   _bt_isortcmpinit(index, btspool);
+   /*
+    * Currently, tuplesort provides sort functions on IndexTuples.
+    * If we kept anything in a BTItem other than a regular IndexTuple,
+    * we'd need to modify tuplesort to understand BTItems as such.
+    */
+   Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
  
-   return (void *) btspool;
+   return btspool;
  }
  
  /*
   * clean up a spool structure and its substructures.
   */
  void
-_bt_spooldestroy(void *spool)
+_bt_spooldestroy(BTSpool *btspool)
  {
-   BTSpool    *btspool = (BTSpool *) spool;
-   int         i;
-
-   for (i = 0; i < btspool->bts_ntapes; ++i)
-   {
-       _bt_tapedestroy(btspool->bts_otape[i]);
-       _bt_tapedestroy(btspool->bts_itape[i]);
-   }
+   tuplesort_end(btspool->sortstate);
     pfree((void *) btspool);
  }
  
  /*
- * flush out any dirty output tape blocks
+ * spool a btitem into the sort file.
   */
-static void
-_bt_spoolflush(BTSpool *btspool)
+void
+_bt_spool(BTItem btitem, BTSpool *btspool)
  {
-   int         i;
-
-   for (i = 0; i < btspool->bts_ntapes; ++i)
-   {
-       if (!EMPTYTAPE(btspool->bts_otape[i]))
-           _bt_tapewrite(btspool->bts_otape[i], 1);
-   }
+   /* A BTItem is really just an IndexTuple */
+   tuplesort_puttuple(btspool->sortstate, (void *) btitem);
  }
  
  /*
- * swap input tapes and output tapes by swapping their file
- * descriptors.  additional preparation for the next merge pass
- * includes rewinding the new input tapes and clearing out the new
- * output tapes.
+ * given a spool loaded by successive calls to _bt_spool,
+ * create an entire btree.
   */
-static void
-_bt_spoolswap(BTSpool *btspool)
+void
+_bt_leafbuild(BTSpool *btspool)
  {
-   File        tmpfd;
-   BTTapeBlock *itape;
-   BTTapeBlock *otape;
-   int         i;
-
-   for (i = 0; i < btspool->bts_ntapes; ++i)
+#ifdef BTREE_BUILD_STATS
+   if (ShowExecutorStats)
     {
-       itape = btspool->bts_itape[i];
-       otape = btspool->bts_otape[i];
-
-       /*
-        * swap the input and output VFDs.
-        */
-       tmpfd = itape->bttb_fd;
-       itape->bttb_fd = otape->bttb_fd;
-       otape->bttb_fd = tmpfd;
-
-       /*
-        * rewind the new input tape.
-        */
-       _bt_taperewind(itape);
-       _bt_tapereset(itape);
-
-       /*
-        * clear the new output tape -- it's ok to throw away the old
-        * inputs.
-        */
-       _bt_tapeclear(otape);
+       fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
+       ShowUsage();
+       ResetUsage();
     }
+#endif
+   tuplesort_performsort(btspool->sortstate);
+
+   _bt_load(btspool->index, btspool);
  }
  
-/*-------------------------------------------------------------------------
- * sorting routines
- *-------------------------------------------------------------------------
- */
  
  /*
- * spool 'btitem' into an initial run. as tape blocks are filled, the
- * block BTItems are qsorted and written into some output tape (it
- * doesn't matter which; we go round-robin for simplicity).  the
- * initial runs are therefore always just one block.
+ * Internal routines.
   */
-void
-_bt_spool(Relation index, BTItem btitem, void *spool)
-{
-   BTSpool    *btspool = (BTSpool *) spool;
-   BTTapeBlock *itape;
-   Size        itemsz;
-
-   _bt_isortcmpinit(index, btspool);
  
-   itape = btspool->bts_itape[btspool->bts_tape];
-   itemsz = BTITEMSZ(btitem);
-   itemsz = MAXALIGN(itemsz);
-
-   /*
-    * if this buffer is too full for this BTItemData, or if we have run
-    * out of BTItems, we need to sort the buffer and write it out.  in
-    * this case, the BTItemData will go into the next tape's buffer.
-    */
-   if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
-   {
-       BTSortKey  *parray = (BTSortKey *) NULL;
-       BTTapeBlock *otape;
-       BTItem      bti;
-       char       *pos;
-       int         btisz;
-       int         it_ntup = itape->bttb_ntup;
-       int         i;
-
-       /*
-        * build an array of pointers to the BTItemDatas on the input
-        * block.
-        */
-       if (it_ntup > 0)
-       {
-           parray = (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
-           pos = itape->bttb_data;
-           for (i = 0; i < it_ntup; ++i)
-               _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
-
-           /*
-            * qsort the pointer array.
-            */
-           qsort((void *) parray, it_ntup, sizeof(BTSortKey),
-                 (int (*) (const void *, const void *)) _bt_isortcmp);
-       }
-
-       /*
-        * write the spooled run into the output tape.  we copy the
-        * BTItemDatas in the order dictated by the sorted array of
-        * BTItems, not the original order.
-        *
-        * (since everything was MAXALIGN'd and is all on a single tape
-        * block, everything had *better* still fit on one tape block..)
-        */
-       otape = btspool->bts_otape[btspool->bts_tape];
-       for (i = 0; i < it_ntup; ++i)
-       {
-           bti = parray[i].btsk_item;
-           btisz = BTITEMSZ(bti);
-           btisz = MAXALIGN(btisz);
-           _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
-           {
-               bool        isnull;
-               Datum       d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
-                                             &isnull);
-
-               printf("_bt_spool: inserted <%x> into output tape %d\n",
-                      d, btspool->bts_tape);
-           }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
-       }
-
-       /*
-        * the initial runs are always single tape blocks.  flush the
-        * output block, marking End-Of-Run.
-        */
-       _bt_tapewrite(otape, 1);
-
-       /*
-        * reset the input buffer for the next run.  we don't have to
-        * write it out or anything -- we only use it to hold the unsorted
-        * BTItemDatas, the output tape contains all the sorted stuff.
-        *
-        * changing bts_tape changes the output tape and input tape; we
-        * change itape for the code below.
-        */
-       _bt_tapereset(itape);
-       btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
-       itape = btspool->bts_itape[btspool->bts_tape];
-
-       /*
-        * destroy the pointer array.
-        */
-       if (parray != (BTSortKey *) NULL)
-       {
-           for (i = 0; i < it_ntup; i++)
-           {
-               if (parray[i].btsk_datum != (Datum *) NULL)
-                   pfree((void *) (parray[i].btsk_datum));
-               if (parray[i].btsk_nulls != (char *) NULL)
-                   pfree((void *) (parray[i].btsk_nulls));
-           }
-           pfree((void *) parray);
-       }
-   }
-
-   /* insert this item into the current buffer */
-   if (btitem != (BTItem) NULL)
-       _bt_tapeadd(itape, btitem, itemsz);
-}
  
  /*
   * allocate a new, clean btree page, not linked to any siblings.
@@ -805,7 +203,7 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
   * allocate and initialize a new BTPageState.  the returned structure
   * is suitable for immediate use by _bt_buildadd.
   */
-static void *
+static BTPageState *
  _bt_pagestate(Relation index, int flags, int level, bool doupper)
  {
     BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
@@ -819,7 +217,7 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper)
     state->btps_level = level;
     state->btps_doupper = doupper;
  
-   return (void *) state;
+   return state;
  }
  
  /*
@@ -883,9 +281,8 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
   * if all keys are unique, 'first' will always be the same as 'last'.
   */
  static BTItem
-_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
  {
-   BTPageState *state = (BTPageState *) pstate;
     Buffer      nbuf;
     Page        npage;
     BTItem      last_bti;
@@ -944,8 +341,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
             if (PageAddItem(npage, PageGetItem(opage, ii),
                           ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
                 elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
             {
                 bool        isnull;
                 BTItem      tmpbti =
@@ -956,7 +352,6 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
                 printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
                        d, n, state->btps_level);
             }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
  #endif
         }
  
@@ -989,7 +384,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
             nopaque->btpo_prev = BufferGetBlockNumber(obuf);
             nopaque->btpo_next = P_NONE;
  
-           if (_bt_itemcmp(index, _bt_nattr,
+           if (_bt_itemcmp(index, index->rd_att->natts,
               (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
             (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
                             BTEqualStrategyNumber))
@@ -1030,8 +425,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
     off = OffsetNumberNext(last_off);
     if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
         elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
     {
         bool        isnull;
         Datum       d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
@@ -1039,11 +433,10 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
         printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
                d, off, state->btps_level);
     }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
  #endif
     if (last_bti == (BTItem) NULL)
         first_off = P_FIRSTKEY;
-   else if (!_bt_itemcmp(index, _bt_nattr,
+   else if (!_bt_itemcmp(index, index->rd_att->natts,
                           bti, last_bti, BTEqualStrategyNumber))
         first_off = off;
     last_off = off;
@@ -1103,224 +496,31 @@ _bt_uppershutdown(Relation index, BTPageState *state)
  }
  
  /*
- * take the input tapes stored by 'btspool' and perform successive
- * merging passes until at most one run is left in each tape.  at that
- * point, merge the final tape runs into a set of btree leaves.
- *
- * XXX three nested loops? gross.  cut me up into smaller routines.
+ * Read tuples in correct sort order from tuplesort, and load them into
+ * btree leaves.
   */
  static void
-_bt_merge(Relation index, BTSpool *btspool)
+_bt_load(Relation index, BTSpool *btspool)
  {
     BTPageState *state;
-   BTPriQueue  q;
-   BTPriQueueElem e;
-   BTSortKey   btsk;
     BTItem      bti;
-   BTTapeBlock *itape;
-   BTTapeBlock *otape;
-   char       *tapepos[MAXTAPES];
-   int         tapedone[MAXTAPES];
-   int         t;
-   int         goodtapes;
-   int         npass;
-   int         nruns;
-   Size        btisz;
-   bool        doleaf = false;
+   bool        should_free;
  
     /*
      * initialize state needed for the merge into the btree leaf pages.
      */
-   state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
-
-   npass = 0;
-   do
-   {                           /* pass */
-
-       /*
-        * each pass starts by flushing the previous outputs and swapping
-        * inputs and outputs.  flushing sets End-of-Run for any dirty
-        * output tapes.  swapping clears the new output tapes and rewinds
-        * the new input tapes.
-        */
-       btspool->bts_tape = btspool->bts_ntapes - 1;
-       _bt_spoolflush(btspool);
-       _bt_spoolswap(btspool);
-
-       ++npass;
-       nruns = 0;
-
-       for (;;)
-       {                       /* run */
-
-           /*
-            * each run starts by selecting a new output tape.  the merged
-            * results of a given run are always sent to this one tape.
-            */
-           btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
-           otape = btspool->bts_otape[btspool->bts_tape];
-
-           /*
-            * initialize the priority queue by loading it with the first
-            * element of the given run in each tape.  since we are
-            * starting a new run, we reset the tape (clearing the
-            * End-Of-Run marker) before reading it.  this means that
-            * _bt_taperead will return 0 only if the tape is actually at
-            * EOF.
-            */
-           MemSet((char *) &q, 0, sizeof(BTPriQueue));
-           goodtapes = 0;
-           for (t = 0; t < btspool->bts_ntapes; ++t)
-           {
-               itape = btspool->bts_itape[t];
-               tapepos[t] = itape->bttb_data;
-               tapedone[t] = 0;
-               _bt_tapereset(itape);
-               do
-               {
-                   if (_bt_taperead(itape) == 0)
-                       tapedone[t] = 1;
-               } while (!tapedone[t] && EMPTYTAPE(itape));
-               if (!tapedone[t])
-               {
-                   ++goodtapes;
-                   e.btpqe_tape = t;
-                   _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
-                                  &(e.btpqe_item));
-                   if (e.btpqe_item.btsk_item != (BTItem) NULL)
-                       _bt_pqadd(&q, &e);
-               }
-           }
-
-           /*
-            * if we don't have any tapes with any input (i.e., they are
-            * all at EOF), there is no work to do in this run -- we must
-            * be done with this pass.
-            */
-           if (goodtapes == 0)
-           {
-               break;          /* for */
-           }
-           ++nruns;
-
-           /*
-            * output the smallest element from the queue until there are
-            * no more.
-            */
-           while (_bt_pqnext(&q, &e) >= 0)
-           {                   /* item */
-
-               /*
-                * replace the element taken from priority queue, fetching
-                * a new block if needed.  a tape can run out if it hits
-                * either End-Of-Run or EOF.
-                */
-               t = e.btpqe_tape;
-               btsk = e.btpqe_item;
-               bti = btsk.btsk_item;
-               if (bti != (BTItem) NULL)
-               {
-                   btisz = BTITEMSZ(bti);
-                   btisz = MAXALIGN(btisz);
-                   if (doleaf)
-                   {
-                       _bt_buildadd(index, state, bti, BTP_LEAF);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
-                       {
-                           bool        isnull;
-                           Datum       d = index_getattr(&(bti->bti_itup), 1,
-                                                index->rd_att, &isnull);
-
-                           printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
-                                  npass, nruns, d, t,
-                                  BufferGetBlockNumber(state->btps_buf));
-                       }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
-                   }
-                   else
-                   {
-                       if (SPCLEFT(otape) < btisz)
-                       {
-
-                           /*
-                            * if it's full, write it out and add the item
-                            * to the next block.  (since we will be
-                            * adding another tuple immediately after
-                            * this, we can be sure that there will be at
-                            * least one more block in this run and so we
-                            * know we do *not* want to set End-Of-Run
-                            * here.)
-                            */
-                           _bt_tapewrite(otape, 0);
-                       }
-                       _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
-                       {
-                           bool        isnull;
-                           Datum       d = index_getattr(&(bti->bti_itup), 1,
-                                                index->rd_att, &isnull);
-
-                           printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
-                                  npass, nruns, d, t,
-                                  btspool->bts_tape);
-                       }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
-                   }
-
-                   if (btsk.btsk_datum != (Datum *) NULL)
-                       pfree((void *) (btsk.btsk_datum));
-                   if (btsk.btsk_nulls != (char *) NULL)
-                       pfree((void *) (btsk.btsk_nulls));
-
-               }
-               itape = btspool->bts_itape[t];
-               if (!tapedone[t])
-               {
-                   BTItem      newbti = _bt_tapenext(itape, &tapepos[t]);
-
-                   if (newbti == (BTItem) NULL)
-                   {
-                       do
-                       {
-                           if (_bt_taperead(itape) == 0)
-                               tapedone[t] = 1;
-                       } while (!tapedone[t] && EMPTYTAPE(itape));
-                       if (!tapedone[t])
-                       {
-                           tapepos[t] = itape->bttb_data;
-                           newbti = _bt_tapenext(itape, &tapepos[t]);
-                       }
-                   }
-                   if (newbti != (BTItem) NULL)
-                   {
-                       BTPriQueueElem nexte;
-
-                       nexte.btpqe_tape = t;
-                       _bt_setsortkey(index, newbti, &(nexte.btpqe_item));
-                       _bt_pqadd(&q, &nexte);
-                   }
-               }
-           }                   /* item */
-
-           /*
-            * that's it for this run.  flush the output tape, marking
-            * End-of-Run.
-            */
-           _bt_tapewrite(otape, 1);
-       }                       /* run */
+   state = _bt_pagestate(index, BTP_LEAF, 0, true);
  
-       /*
-        * we are here because we ran out of input on all of the input
-        * tapes.
-        *
-        * if this pass did not generate more actual output runs than we have
-        * tapes, we know we have at most one run in each tape.  this
-        * means that we are ready to merge into the final btree leaf
-        * pages instead of merging into a tape file.
-        */
-       if (nruns <= btspool->bts_ntapes)
-           doleaf = true;
-   } while (nruns > 0);        /* pass */
+   for (;;)
+   {
+       bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
+                                              &should_free);
+       if (bti == (BTItem) NULL)
+           break;
+       _bt_buildadd(index, state, bti, BTP_LEAF);
+       if (should_free)
+           pfree((void *) bti);
+   }
  
     _bt_uppershutdown(index, state);
  }
@@ -1359,7 +559,7 @@ _bt_upperbuild(Relation index)
     ropaque->btpo_flags &= ~BTP_ROOT;
     _bt_wrtbuf(index, rbuf);
  
-   state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
+   state = _bt_pagestate(index, 0, 0, true);
  
     /* for each page... */
     do
@@ -1380,7 +580,7 @@ _bt_upperbuild(Relation index)
              * the lower page and insert it into a page at this level.
              */
             nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
             {
                 bool        isnull;
                 Datum       d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
@@ -1389,7 +589,7 @@ _bt_upperbuild(Relation index)
                 printf("_bt_upperbuild: inserting <%x> at %d\n",
                        d, state->btps_level);
             }
-#endif  /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+#endif
             _bt_buildadd(index, state, nbti, 0);
             pfree((void *) nbti);
         }
@@ -1401,25 +601,3 @@ _bt_upperbuild(Relation index)
  }
  
  #endif
-
-/*
- * given a spool loading by successive calls to _bt_spool, create an
- * entire btree.
- */
-void
-_bt_leafbuild(Relation index, void *spool)
-{
-   _bt_isortcmpinit(index, (BTSpool *) spool);
-
-#ifdef BTREE_BUILD_STATS
-   if (ShowExecutorStats)
-   {
-       fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
-       ShowUsage();
-       ResetUsage();
-   }
-#endif
-
-   _bt_merge(index, (BTSpool *) spool);
-
-}
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c

index f82fecf0d6f8ac26db7c720446eff050927c348d..14e8b46aa86cfb69516775f740f328de19e4150a 100644 (file)
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -7,16 +7,17 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.23 1999/07/17 20:16:58 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.24 1999/10/17 22:15:02 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
  
  #include "postgres.h"
+
  #include "executor/executor.h"
  #include "executor/execdebug.h"
  #include "executor/nodeSort.h"
-#include "utils/psort.h"
+#include "utils/tuplesort.h"
  
  /* ----------------------------------------------------------------
   *     FormSortKeys(node)
@@ -83,11 +84,9 @@ FormSortKeys(Sort *sortnode)
  /* ----------------------------------------------------------------
   *     ExecSort
   *
- * old comments
- *     Sorts tuples from the outer subtree of the node in psort,
+ *     Sorts tuples from the outer subtree of the node using tuplesort,
   *     which saves the results in a temporary file or memory. After the
   *     initial call, returns a tuple from the file with each call.
- *     Assumes that heap access method is used.
   *
   *     Conditions:
   *       -- none.
@@ -101,10 +100,8 @@ ExecSort(Sort *node)
  {
     EState     *estate;
     SortState  *sortstate;
-   Plan       *outerNode;
     ScanDirection dir;
-   int         keycount;
-   ScanKey     sortkeys;
+   Tuplesortstate *tuplesortstate;
     HeapTuple   heapTuple;
     TupleTableSlot *slot;
     bool        should_free;
@@ -119,43 +116,71 @@ ExecSort(Sort *node)
     sortstate = node->sortstate;
     estate = node->plan.state;
     dir = estate->es_direction;
+   tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate;
  
     /* ----------------
-    *  the first time we call this, psort sorts this into a file.
-    *  Subsequent calls return tuples from psort.
+    *  If first time through, read all tuples from outer plan and
+    *  pass them to tuplesort.c.
+    *  Subsequent calls just fetch tuples from tuplesort.
      * ----------------
      */
  
-   if (sortstate->sort_Flag == false)
+   if (! sortstate->sort_Done)
     {
+       Plan       *outerNode;
+       TupleDesc   tupDesc;
+       int         keycount;
+       ScanKey     sortkeys;
+
         SO1_printf("ExecSort: %s\n",
-                  "sortstate == false -> sorting subplan");
+                  "sorting subplan");
         /* ----------------
-        *  set all relations to be scanned in the forward direction
-        *  while creating the temporary relation.
+        *  Want to scan subplan in the forward direction while creating
+        *  the sorted data.  (Does setting my direction actually affect
+        *  the subplan?  I bet this is useless code...)
          * ----------------
          */
         estate->es_direction = ForwardScanDirection;
  
         /* ----------------
-        *   prepare information for psort_begin()
+        *   Initialize tuplesort module.
          * ----------------
          */
-       outerNode = outerPlan((Plan *) node);
+       SO1_printf("ExecSort: %s\n",
+                  "calling tuplesort_begin");
  
+       outerNode = outerPlan((Plan *) node);
+       tupDesc = ExecGetTupType(outerNode);
         keycount = node->keycount;
         sortkeys = (ScanKey) sortstate->sort_Keys;
-       SO1_printf("ExecSort: %s\n",
-                  "calling psort_begin");
  
-       if (!psort_begin(node,  /* this node */
-                        keycount,      /* number keys */
-                        sortkeys))     /* keys */
+       tuplesortstate = tuplesort_begin_heap(tupDesc, keycount, sortkeys,
+                                             true /* randomAccess */);
+
+       sortstate->tuplesortstate = (void *) tuplesortstate;
+
+       /* ----------------
+        *   Scan the subplan and feed all the tuples to tuplesort.
+        * ----------------
+        */
+
+       for (;;)
         {
-           /* Psort says, there are no tuples to be sorted */
-           return NULL;
+           slot = ExecProcNode(outerNode, (Plan *) node);
+
+           if (TupIsNull(slot))
+               break;
+
+           tuplesort_puttuple(tuplesortstate, (void *) slot->val);
+           ExecClearTuple(slot);
         }
  
+       /* ----------------
+        *   Complete the sort.
+        * ----------------
+        */
+       tuplesort_performsort(tuplesortstate);
+
         /* ----------------
          *   restore to user specified direction
          * ----------------
@@ -167,25 +192,29 @@ ExecSort(Sort *node)
          * ----------------
          */
         slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
-       slot->ttc_tupleDescriptor = ExecGetTupType(outerNode);
+       slot->ttc_tupleDescriptor = tupDesc;
+
         /* ----------------
          *  finally set the sorted flag to true
          * ----------------
          */
-       sortstate->sort_Flag = true;
+       sortstate->sort_Done = true;
         SO1_printf(stderr, "ExecSort: sorting done.\n");
     }
     else
         slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
  
     SO1_printf("ExecSort: %s\n",
-              "retrieving tuple from sorted relation");
+              "retrieving tuple from tuplesort");
  
     /* ----------------
-    *  at this point we grab a tuple from psort
+    *  Get the first or next tuple from tuplesort.
+    *  Returns NULL if no more tuples.
      * ----------------
      */
-   heapTuple = psort_grabtuple(node, &should_free);
+   heapTuple = tuplesort_getheaptuple(tuplesortstate,
+                                      ScanDirectionIsForward(dir),
+                                      &should_free);
  
     return ExecStoreTuple(heapTuple, slot, InvalidBuffer, should_free);
  }
@@ -193,7 +222,6 @@ ExecSort(Sort *node)
  /* ----------------------------------------------------------------
   *     ExecInitSort
   *
- * old comments
   *     Creates the run-time state information for the sort node
   *     produced by the planner and initailizes its outer subtree.
   * ----------------------------------------------------------------
@@ -203,7 +231,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
  {
     SortState  *sortstate;
     Plan       *outerPlan;
-   ScanKey     sortkeys;
  
     SO1_printf("ExecInitSort: %s\n",
                "initializing sort node");
@@ -219,14 +246,14 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
      * ----------------
      */
     sortstate = makeNode(SortState);
-   sortstate->sort_Flag = 0;
+   sortstate->sort_Done = false;
     sortstate->sort_Keys = NULL;
-   node->cleaned = FALSE;
+   sortstate->tuplesortstate = NULL;
  
     node->sortstate = sortstate;
  
     /* ----------------
-    *  Miscellanious initialization
+    *  Miscellaneous initialization
      *
      *       +  assign node's base_id
      *       +  assign debugging hooks
@@ -259,9 +286,7 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
      *  initialize sortstate information
      * ----------------
      */
-   sortkeys = FormSortKeys(node);
-   sortstate->sort_Keys = sortkeys;
-   sortstate->sort_Flag = false;
+   sortstate->sort_Keys = FormSortKeys(node);
  
     /* ----------------
      *  initialize tuple type.  no need to initialize projection
@@ -275,11 +300,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
     SO1_printf("ExecInitSort: %s\n",
                "sort node initialized");
  
-   /* ----------------
-    *  return relation oid of temporary sort relation in a list
-    *  (someday -- for now we return LispTrue... cim 10/12/89)
-    * ----------------
-    */
     return TRUE;
  }
  
@@ -293,8 +313,6 @@ ExecCountSlotsSort(Sort *node)
  
  /* ----------------------------------------------------------------
   *     ExecEndSort(node)
- *
- * old comments
   * ----------------------------------------------------------------
   */
  void
@@ -325,8 +343,13 @@ ExecEndSort(Sort *node)
      */
     ExecClearTuple(sortstate->csstate.css_ScanTupleSlot);
  
-   /* Clean up after psort */
-   psort_end(node);
+   /* ----------------
+    *  Release tuplesort resources
+    * ----------------
+    */
+   if (sortstate->tuplesortstate != NULL)
+       tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+   sortstate->tuplesortstate = NULL;
  
     SO1_printf("ExecEndSort: %s\n",
                "sort node shutdown");
@@ -335,51 +358,47 @@ ExecEndSort(Sort *node)
  /* ----------------------------------------------------------------
   *     ExecSortMarkPos
   *
- *     Calls psort to save the current position in the sorted file.
+ *     Calls tuplesort to save the current position in the sorted file.
   * ----------------------------------------------------------------
   */
  void
  ExecSortMarkPos(Sort *node)
  {
-   SortState  *sortstate;
+   SortState  *sortstate = node->sortstate;
  
     /* ----------------
      *  if we haven't sorted yet, just return
      * ----------------
      */
-   sortstate = node->sortstate;
-   if (sortstate->sort_Flag == false)
+   if (! sortstate->sort_Done)
         return;
  
-   psort_markpos(node);
-
-   return;
+   tuplesort_markpos((Tuplesortstate *) sortstate->tuplesortstate);
  }
  
  /* ----------------------------------------------------------------
   *     ExecSortRestrPos
   *
- *     Calls psort to restore the last saved sort file position.
+ *     Calls tuplesort to restore the last saved sort file position.
   * ----------------------------------------------------------------
   */
  void
  ExecSortRestrPos(Sort *node)
  {
-   SortState  *sortstate;
+   SortState  *sortstate = node->sortstate;
  
     /* ----------------
      *  if we haven't sorted yet, just return.
      * ----------------
      */
-   sortstate = node->sortstate;
-   if (sortstate->sort_Flag == false)
+   if (! sortstate->sort_Done)
         return;
  
     /* ----------------
      *  restore the scan to the previously marked position
      * ----------------
      */
-   psort_restorepos(node);
+   tuplesort_restorepos((Tuplesortstate *) sortstate->tuplesortstate);
  }
  
  void
@@ -392,17 +411,25 @@ ExecReScanSort(Sort *node, ExprContext *exprCtxt, Plan *parent)
      * not NULL then it will be re-scanned by ExecProcNode, else - no
      * reason to re-scan it at all.
      */
-   if (sortstate->sort_Flag == false)
+   if (! sortstate->sort_Done)
         return;
  
     ExecClearTuple(sortstate->csstate.cstate.cs_ResultTupleSlot);
  
-   psort_rescan(node);
-
     /*
-    * If subnode is to be rescanned then we aren't sorted
+    * If subnode is to be rescanned then we forget previous sort
+    * results; we have to re-read the subplan and re-sort.
+    *
+    * Otherwise we can just rewind and rescan the sorted output.
      */
     if (((Plan *) node)->lefttree->chgParam != NULL)
-       sortstate->sort_Flag = false;
-
+   {
+       sortstate->sort_Done = false;
+       tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+       sortstate->tuplesortstate = NULL;
+   }
+   else
+   {
+       tuplesort_rescan((Tuplesortstate *) sortstate->tuplesortstate);
+   }
  }
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile

index d411a89c735a3b1e667a67c20a6b6ebedb8fb6ee..c680a089230667a58b828170b20ef11b85ead459 100644 (file)
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -4,7 +4,7 @@
  #    Makefile for utils/sort
  #
  # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.6 1999/10/16 19:49:27 tgl Exp $
+#    $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.7 1999/10/17 22:15:05 tgl Exp $
  #
  #-------------------------------------------------------------------------
  
@@ -13,7 +13,7 @@ include ../../../Makefile.global
  
  CFLAGS += -I../..
  
-OBJS = logtape.o lselect.o psort.o
+OBJS = logtape.o tuplesort.o
  
  all: SUBSYS.o
  
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c

index 8d5d34c00a73aec9239d2efd90cc1040b3a14f69..46497598b56bbd2c8a75dcb7fd85ff1cc8923056 100644 (file)
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -4,8 +4,8 @@
   *   Management of "logical tapes" within temporary files.
   *
   * This module exists to support sorting via multiple merge passes (see
- * psort.c).  Merging is an ideal algorithm for tape devices, but if we
- * implement it on disk by creating a separate file for each "tape",
+ * tuplesort.c).  Merging is an ideal algorithm for tape devices, but if
+ * we implement it on disk by creating a separate file for each "tape",
   * there is an annoying problem: the peak space usage is at least twice
   * the volume of actual data to be sorted.  (This must be so because each
   * datum will appear in both the input and output tapes of the final
@@ -23,7 +23,7 @@
   * Few OSes allow arbitrary parts of a file to be released back to the OS,
   * so we have to implement this space-recycling ourselves within a single
   * logical file.  logtape.c exists to perform this bookkeeping and provide
- * the illusion of N independent tape devices to psort.c.  Note that
+ * the illusion of N independent tape devices to tuplesort.c.  Note that
   * logtape.c itself depends on buffile.c to provide a "logical file" of
   * larger size than the underlying OS may support.
   *
@@ -63,7 +63,7 @@
   * Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.1 1999/10/16 19:49:27 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.2 1999/10/17 22:15:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c

new file mode 100644 (file)

index 0000000..2240564
--- /dev/null
+++ b/src/backend/utils/sort/tuplesort.c
@@ -0,0 +1,1465 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.c
+ *   Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary).  It works efficiently for both small and large amounts
+ * of data.  Small amounts are sorted in-memory using qsort().  Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * See Knuth, volume 3, for more than you want to know about the external
+ * sorting algorithm.  We divide the input into sorted runs using replacement
+ * selection, in the form of a priority tree implemented as a heap
+ * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
+ * merge, Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D
+ * are implemented by logtape.c, which avoids space wastage by recycling
+ * disk space as soon as each block is read from its "tape".
+ *
+ * We do not form the initial runs using Knuth's recommended replacement
+ * selection method (Algorithm 5.4.1R), because it uses a fixed number of
+ * records in memory at all times.  Since we are dealing with tuples that
+ * may vary considerably in size, we want to be able to vary the number of
+ * records kept in memory to ensure full utilization of the allowed sort
+ * memory space.  This is easily done by keeping a variable-size heap in
+ * which the records of the current run are stored, plus a variable-size
+ * unsorted array holding records that must go into the next run.
+ *
+ * The (approximate) amount of memory allowed for any one sort operation
+ * is given in kilobytes by the external variable SortMem.  Initially,
+ * we absorb tuples and simply store them in an unsorted array as long as
+ * we haven't exceeded SortMem.  If we reach the end of the input without
+ * exceeding SortMem, we sort the array using qsort() and subsequently return
+ * tuples just by scanning the tuple array sequentially.  If we do exceed
+ * SortMem, we construct a heap using Algorithm H and begin to emit tuples
+ * into sorted runs in temporary tapes, emitting just enough tuples at each
+ * step to get back within the SortMem limit.  New tuples are added to the
+ * heap if they can go into the current run, else they are temporarily added
+ * to the unsorted array.  Whenever the heap empties, we construct a new heap
+ * from the current contents of the unsorted array, and begin a new run with a
+ * new output tape (selected per Algorithm D).  After the end of the input
+ * is reached, we dump out remaining tuples in memory into a final run
+ * (or two), then merge the runs using Algorithm D.
+ *
+ * When the caller requests random access to the sort result, we form
+ * the final sorted run on a logical tape which is then "frozen", so
+ * that we can access it randomly.  When the caller does not need random
+ * access, we return from tuplesort_performsort() as soon as we are down
+ * to one run per logical tape.  The final merge is then performed
+ * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this
+ * saves one cycle of writing all the data out to disk and reading it in.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *   $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.1 1999/10/17 22:15:05 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/nbtree.h"
+#include "miscadmin.h"
+#include "utils/logtape.h"
+#include "utils/tuplesort.h"
+
+/*
+ * Possible states of a Tuplesort object.  These denote the states that
+ * persist between calls of Tuplesort routines.
+ *
+ * Transitions: tuplesort_begin_common starts in TSS_INITIAL; inittapes
+ * moves to TSS_BUILDRUNS once the memory limit has been exceeded;
+ * tuplesort_performsort moves TSS_INITIAL to TSS_SORTEDINMEM, while
+ * mergeruns moves TSS_BUILDRUNS to TSS_SORTEDONTAPE or TSS_FINALMERGE.
+ * tuplesort_puttuple accepts only the two "loading" states, and
+ * tuplesort_gettuple only the three "completed" states.
+ */
+typedef enum
+{
+   TSS_INITIAL,        /* Loading tuples; still within memory limit */
+   TSS_BUILDRUNS,      /* Loading tuples; writing to tape */
+   TSS_SORTEDINMEM,    /* Sort completed entirely in memory */
+   TSS_SORTEDONTAPE,   /* Sort completed, final run is on tape */
+   TSS_FINALMERGE      /* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * We use a seven-tape polyphase merge, which is the "sweet spot" on the
+ * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2).
+ *
+ * The tp_xxx[] arrays in Tuplesortstate have MAXTAPES entries.  While
+ * merging, logical tapes 0 .. TAPERANGE-1 are the inputs and logical tape
+ * TAPERANGE is the output (see mergeruns).
+ */
+#define MAXTAPES       7               /* Knuth's T */
+#define TAPERANGE      (MAXTAPES-1)    /* Knuth's P */
+
+/*
+ * Private state of a Tuplesort operation.
+ *
+ * The record is allocated and zeroed by tuplesort_begin_common; the
+ * tuple-kind-specific fields at the end are filled in by
+ * tuplesort_begin_heap or tuplesort_begin_index.
+ */
+struct Tuplesortstate
+{
+   TupSortStatus status;       /* enumerated value as shown above */
+   bool        randomAccess;   /* did caller request random access? */
+   long        availMem;       /* remaining memory available, in bytes */
+   /* tapeset stays NULL until inittapes creates it */
+   LogicalTapeSet *tapeset;    /* logtape.c object for tapes in a temp file */
+
+   /*
+    * These function pointers decouple the routines that must know what kind
+    * of tuple we are sorting from the routines that don't need to know it.
+    * They are set up by the tuplesort_begin_xxx routines.
+    *
+    * Function to compare two tuples; result is per qsort() convention,
+    * ie, <0, 0, >0 according as a<b, a=b, a>b.
+    */
+   int (*comparetup) (Tuplesortstate *state, const void *a, const void *b);
+   /*
+    * Function to copy a supplied input tuple into palloc'd space.
+    * (NB: we assume that a single pfree() is enough to release the tuple
+    * later, so the representation must be "flat" in one palloc chunk.)
+    * state->availMem must be decreased by the amount of space used.
+    */
+   void * (*copytup) (Tuplesortstate *state, void *tup);
+   /*
+    * Function to write a stored tuple onto tape.  The representation of
+    * the tuple on tape need not be the same as it is in memory; requirements
+    * on the tape representation are given below.  After writing the tuple,
+    * pfree() it, and increase state->availMem by the amount of memory space
+    * thereby released.
+    */
+   void (*writetup) (Tuplesortstate *state, int tapenum, void *tup);
+   /*
+    * Function to read a stored tuple from tape back into memory.
+    * 'len' is the already-read length of the stored tuple.  Create and
+    * return a palloc'd copy, and decrease state->availMem by the amount
+    * of memory space consumed.
+    */
+   void * (*readtup) (Tuplesortstate *state, int tapenum, unsigned int len);
+
+   /*
+    * This array holds "unsorted" tuples during the input phases.
+    * If we are able to complete the sort in memory, it holds the
+    * final sorted result as well.
+    *
+    * It is allocated by tuplesort_begin_common with room for 1024 entries
+    * and doubled by tuplesort_puttuple whenever it fills up.
+    */
+   void      **memtuples;      /* array of pointers to palloc'd tuples */
+   int         memtupcount;    /* number of tuples currently present */
+   int         memtupsize;     /* allocated length of memtuples array */
+
+   /*
+    * This array holds the partially-sorted "heap" of tuples that will go
+    * out in the current run during BUILDRUNS state.  While completing
+    * the sort, we use it to merge runs of tuples from input tapes.
+    * It is never allocated unless we need to use tapes.
+    */
+   void      **heaptuples;     /* array of pointers to palloc'd tuples */
+   int         heaptupcount;   /* number of tuples currently present */
+   int         heaptupsize;    /* allocated length of heaptuples array */
+   /*
+    * While merging, this array holds the actual number of the input tape
+    * that each tuple in heaptuples[] came from.
+    */
+   int        *heapsrctapes;
+
+   /*
+    * Variables for Algorithm D.  Note that destTape is a "logical" tape
+    * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
+    * "logical" and "actual" tape numbers straight!
+    *
+    * All of these are initialized by inittapes, not by the begin routines.
+    */
+   int         Level;          /* Knuth's l */
+   int         destTape;       /* current output tape (Knuth's j, less 1) */
+   int         tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */
+   int         tp_runs[MAXTAPES];  /* # of real runs on each tape */
+   int         tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */
+   int         tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */
+
+   bool        multipleRuns;   /* T if we have created more than 1 run */
+
+   /*
+    * These variables are used after completion of sorting to keep track
+    * of the next tuple to return.  (In the tape case, the tape's current
+    * read position is also critical state.)
+    */
+   int         result_tape;    /* actual tape number of finished output */
+   int         current;        /* array index (only used if SORTEDINMEM) */
+   bool        eof_reached;    /* reached EOF (needed for cursors) */
+
+   /* markpos_xxx holds marked position for mark and restore */
+   long        markpos_block;  /* tape block# (only used if SORTEDONTAPE) */
+   int         markpos_offset; /* saved "current", or offset in tape block */
+   bool        markpos_eof;    /* saved "eof_reached" */
+
+   /*
+    * These variables are specific to the HeapTuple case; they are set
+    * by tuplesort_begin_heap and used only by the HeapTuple routines.
+    * Note that scanKeys is the caller's pointer, stored as-is (the key
+    * array is not copied by tuplesort_begin_heap).
+    */
+   TupleDesc   tupDesc;
+   int         nKeys;
+   ScanKey     scanKeys;
+
+   /*
+    * These variables are specific to the IndexTuple case; they are set
+    * by tuplesort_begin_index and used only by the IndexTuple routines.
+    */
+   Relation    indexRel;
+   bool        enforceUnique;  /* complain if we find duplicate tuples */
+};
+
+/*
+ * Convenience macros.  The first four dispatch through the per-tuple-kind
+ * function pointers stored in the Tuplesortstate.  The last three do the
+ * availMem bookkeeping: USEMEM/FREEMEM subtract/add a byte count, and
+ * LACKMEM is true once availMem has gone negative.
+ */
+#define COMPARETUP(state,a,b)  ((*(state)->comparetup) (state, a, b))
+#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))
+#define WRITETUP(state,tape,tup)   ((*(state)->writetup) (state, tape, tup))
+#define READTUP(state,tape,len)    ((*(state)->readtup) (state, tape, len))
+#define LACKMEM(state)     ((state)->availMem < 0)
+#define USEMEM(state,amt)  ((state)->availMem -= (amt))
+#define FREEMEM(state,amt) ((state)->availMem += (amt))
+
+/*--------------------
+ *
+ * NOTES about on-tape representation of tuples:
+ *
+ * We require the first "unsigned int" of a stored tuple to be the total size
+ * on-tape of the tuple, including itself (so it is never zero; an all-zero
+ * unsigned int is used to delimit runs).  The remainder of the stored tuple
+ * may or may not match the in-memory representation of the tuple ---
+ * any conversion needed is the job of the writetup and readtup routines.
+ *
+ * If state->randomAccess is true, then the stored representation of the
+ * tuple must be followed by another "unsigned int" that is a copy of the
+ * length --- so the total tape space used is actually sizeof(unsigned int)
+ * more than the stored length value.  This allows read-backwards.  When
+ * randomAccess is not true, the write/read routines may omit the extra
+ * length word.
+ *
+ * writetup is expected to write both length words as well as the tuple
+ * data.  When readtup is called, the tape is positioned just after the
+ * front length word; readtup must read the tuple data and advance past
+ * the back length word (if present).
+ *
+ * The write/read routines can make use of the tuple description data
+ * stored in the Tuplesortstate record, if needed.  They are also expected
+ * to adjust state->availMem by the amount of memory space (not tape space!)
+ * released or consumed.  There is no error return from either writetup
+ * or readtup; they should elog() on failure.
+ *
+ *
+ * NOTES about memory consumption calculations:
+ *
+ * We count space requested for tuples against the SortMem limit.
+ * Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not
+ * counted, nor do we count the variable-size memtuples and heaptuples
+ * arrays.  (Even though those could grow pretty large, they should be
+ * small compared to the tuples proper, so this is not unreasonable.)
+ *
+ * The major deficiency in this approach is that it ignores palloc overhead.
+ * The memory space actually allocated for a palloc chunk is always more
+ * than the request size, and could be considerably more (as much as 2X
+ * larger, in the current aset.c implementation).  So the space used could
+ * be considerably more than SortMem says.
+ *
+ * One way to fix this is to add a memory management function that, given
+ * a pointer to a palloc'd chunk, returns the actual space consumed by the
+ * chunk.  This would be very easy in the current aset.c module, but I'm
+ * hesitant to do it because it might be unpleasant to support in future
+ * implementations of memory management.  (For example, a direct
+ * implementation of palloc as malloc could not support such a function
+ * portably.)
+ *
+ * A cruder answer is just to apply a fudge factor, say by initializing
+ * availMem to only three-quarters of what SortMem indicates.  This is
+ * probably the right answer if anyone complains that SortMem is not being
+ * obeyed very faithfully.
+ *
+ *--------------------
+ */
+
+/* Shared setup used by both tuplesort_begin_heap and tuplesort_begin_index */
+static Tuplesortstate *tuplesort_begin_common(bool randomAccess);
+/* Tape and run management (polyphase merge, Algorithm D) */
+static void inittapes(Tuplesortstate *state);
+static void selectnewtape(Tuplesortstate *state);
+static void mergeruns(Tuplesortstate *state);
+static void mergeonerun(Tuplesortstate *state);
+static void beginmerge(Tuplesortstate *state);
+static void beginrun(Tuplesortstate *state);
+static void dumptuples(Tuplesortstate *state, bool alltuples);
+/* Maintenance of the heaptuples[] priority heap */
+static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+                                 int tapenum);
+static void tuplesort_heap_siftup(Tuplesortstate *state);
+/* Length-word helpers for the on-tape tuple format */
+static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
+static void markrunend(Tuplesortstate *state, int tapenum);
+/* Comparator handed to qsort() by tuplesort_performsort */
+static int qsort_comparetup(const void *a, const void *b);
+/* HeapTuple support routines, installed by tuplesort_begin_heap */
+static int comparetup_heap(Tuplesortstate *state,
+                          const void *a, const void *b);
+static void *copytup_heap(Tuplesortstate *state, void *tup);
+static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_heap(Tuplesortstate *state, int tapenum,
+                         unsigned int len);
+/* IndexTuple support routines, installed by tuplesort_begin_index */
+static int comparetup_index(Tuplesortstate *state,
+                           const void *a, const void *b);
+static void *copytup_index(Tuplesortstate *state, void *tup);
+static void writetup_index(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_index(Tuplesortstate *state, int tapenum,
+                          unsigned int len);
+
+/*
+ * Since qsort(3) will not pass any context info to qsort_comparetup(),
+ * we have to use this ugly static variable.  It is set to point to the
+ * active Tuplesortstate object just before calling qsort (see the
+ * TSS_INITIAL case of tuplesort_performsort).  It should
+ * not be used directly by anything except qsort_comparetup().
+ *
+ * NOTE(review): the variable is not reset after qsort returns, so it keeps
+ * pointing at the most recently sorted state -- confirm that nothing relies
+ * on it outside the qsort call.
+ */
+static Tuplesortstate *qsort_tuplesortstate;
+
+
+/*
+ *     tuplesort_begin_xxx
+ *
+ * Initialize for a tuple sort operation.
+ *
+ * After calling tuplesort_begin, the caller should call tuplesort_puttuple
+ * zero or more times, then call tuplesort_performsort when all the tuples
+ * have been supplied.  After performsort, retrieve the tuples in sorted
+ * order by calling tuplesort_gettuple until it returns NULL.  (If random
+ * access was requested, rescan, markpos, and restorepos can also be called.)
+ * Call tuplesort_end to terminate the operation and release memory/disk space.
+ */
+
+static Tuplesortstate *
+tuplesort_begin_common(bool randomAccess)
+{
+   Tuplesortstate *ts;
+
+   /* Allocate the state record and start with every field zeroed. */
+   ts = (Tuplesortstate *) palloc(sizeof(Tuplesortstate));
+   MemSet((char *) ts, 0, sizeof(Tuplesortstate));
+
+   /* Overall status and memory budget (SortMem is given in kilobytes). */
+   ts->randomAccess = randomAccess;
+   ts->status = TSS_INITIAL;
+   ts->availMem = SortMem * 1024L;
+
+   /* The tape set and the heap arrays do not exist until they are needed. */
+   ts->tapeset = NULL;
+   ts->heapsrctapes = NULL;
+   ts->heaptuples = NULL;
+   ts->heaptupsize = 0;
+   ts->heaptupcount = 0;
+
+   /* Unsorted-tuple array: empty, sized by an initial guess of 1024 slots. */
+   ts->memtupsize = 1024;
+   ts->memtupcount = 0;
+   ts->memtuples = (void **) palloc(ts->memtupsize * sizeof(void *));
+
+   /* Algorithm D variables are left for inittapes to set up, if needed. */
+
+   /* -1 flags that no result tape has been formed yet. */
+   ts->result_tape = -1;
+
+   return ts;
+}
+
+Tuplesortstate *
+tuplesort_begin_heap(TupleDesc tupDesc,
+                    int nkeys, ScanKey keys,
+                    bool randomAccess)
+{
+   Tuplesortstate *ts;
+
+   ts = tuplesort_begin_common(randomAccess);
+
+   /* The caller must supply at least one usable sort key. */
+   AssertArg(nkeys >= 1);
+   AssertArg(keys[0].sk_attno != 0);
+   AssertArg(keys[0].sk_procedure != 0);
+
+   /*
+    * Remember what the tuples look like and how to order them.  The key
+    * array is referenced, not copied.
+    */
+   ts->tupDesc = tupDesc;
+   ts->scanKeys = keys;
+   ts->nKeys = nkeys;
+
+   /* Plug in the HeapTuple-specific support routines. */
+   ts->readtup = readtup_heap;
+   ts->writetup = writetup_heap;
+   ts->copytup = copytup_heap;
+   ts->comparetup = comparetup_heap;
+
+   return ts;
+}
+
+Tuplesortstate *
+tuplesort_begin_index(Relation indexRel,
+                     bool enforceUnique,
+                     bool randomAccess)
+{
+   Tuplesortstate *ts;
+
+   ts = tuplesort_begin_common(randomAccess);
+
+   /* Remember the index being built and whether duplicates are an error. */
+   ts->enforceUnique = enforceUnique;
+   ts->indexRel = indexRel;
+
+   /* Plug in the IndexTuple-specific support routines. */
+   ts->readtup = readtup_index;
+   ts->writetup = writetup_index;
+   ts->copytup = copytup_index;
+   ts->comparetup = comparetup_index;
+
+   return ts;
+}
+
+/*
+ * tuplesort_end
+ *
+ * Release resources and clean up.
+ *
+ * Closes the logical tape set (if one was created), frees every tuple still
+ * held in the memtuples and heaptuples arrays along with the arrays
+ * themselves, frees heapsrctapes, and finally frees the Tuplesortstate
+ * record.  The state pointer must not be used after this call.
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+   int     i;
+
+   if (state->tapeset)
+       LogicalTapeSetClose(state->tapeset);
+   if (state->memtuples)
+   {
+       for (i = 0; i < state->memtupcount; i++)
+           pfree(state->memtuples[i]);
+       pfree(state->memtuples);
+   }
+   if (state->heaptuples)
+   {
+       for (i = 0; i < state->heaptupcount; i++)
+           pfree(state->heaptuples[i]);
+       pfree(state->heaptuples);
+   }
+   if (state->heapsrctapes)
+       pfree(state->heapsrctapes);
+
+   /*
+    * The state record itself was palloc'd by tuplesort_begin_common, so
+    * release it here as well; otherwise nothing in this module ever does.
+    */
+   pfree(state);
+}
+
+/*
+ * Append an already-copied tuple to the unsorted memtuples array,
+ * doubling the array's allocation first if it is full.
+ */
+static void
+puttuple_save_unsorted(Tuplesortstate *state, void *tuple)
+{
+   if (state->memtupcount >= state->memtupsize)
+   {
+       state->memtupsize *= 2;
+       state->memtuples = (void **)
+           repalloc(state->memtuples,
+                    state->memtupsize * sizeof(void *));
+   }
+   state->memtuples[state->memtupcount++] = tuple;
+}
+
+/*
+ * Accept one tuple while collecting input data for sort.
+ *
+ * The input tuple is always copied, so the caller need not keep it around.
+ * Only valid while the sort is still loading (TSS_INITIAL or TSS_BUILDRUNS);
+ * any other state is reported with elog(ERROR).
+ */
+void
+tuplesort_puttuple(Tuplesortstate *state, void *tuple)
+{
+   void       *copied;
+
+   /* Take a private copy; the copy routine charges it against availMem. */
+   copied = COPYTUP(state, tuple);
+
+   if (state->status == TSS_INITIAL)
+   {
+       /* Still purely in memory: just stash the tuple. */
+       puttuple_save_unsorted(state, copied);
+
+       if (LACKMEM(state))
+       {
+           /*
+            * We just ran out of memory.  Switch to tape-based operation,
+            * start the first run, and write out tuples until we are back
+            * under the limit.
+            */
+           inittapes(state);
+           beginrun(state);
+           dumptuples(state, false);
+       }
+   }
+   else if (state->status == TSS_BUILDRUNS)
+   {
+       /*
+        * A tuple that sorts before the first not-yet-output tuple cannot
+        * join the current run, so it waits in the unsorted array for the
+        * next run; anything else goes into the heap.  (Comparing against
+        * the most recently output tuple would admit more tuples, but that
+        * tuple has already been freed by writetup.)
+        *
+        * dumptuples guarantees the heap is not empty here.
+        */
+       Assert(state->heaptupcount > 0);
+       if (COMPARETUP(state, copied, state->heaptuples[0]) < 0)
+           puttuple_save_unsorted(state, copied);
+       else
+           tuplesort_heap_insert(state, copied, 0);
+
+       /* Write out tuples if that put us over the memory limit. */
+       dumptuples(state, false);
+   }
+   else
+       elog(ERROR, "tuplesort_puttuple: invalid state");
+}
+
+/*
+ * All tuples have been provided; finish the sort.
+ *
+ * On return the state is TSS_SORTEDINMEM, TSS_SORTEDONTAPE or
+ * TSS_FINALMERGE, and the read/mark position is reset to the start.
+ */
+void
+tuplesort_performsort(Tuplesortstate *state)
+{
+   if (state->status == TSS_INITIAL)
+   {
+       /*
+        * Everything fit within the memory allowance, so a plain qsort of
+        * the pointer array finishes the job.  qsort_comparetup finds the
+        * sort state through the static variable set here.
+        */
+       if (state->memtupcount > 1)
+       {
+           qsort_tuplesortstate = state;
+           qsort((void *) state->memtuples, state->memtupcount,
+                 sizeof(void *), qsort_comparetup);
+       }
+       state->status = TSS_SORTEDINMEM;
+       state->current = 0;
+       state->eof_reached = false;
+       state->markpos_eof = false;
+       state->markpos_offset = 0;
+   }
+   else if (state->status == TSS_BUILDRUNS)
+   {
+       /*
+        * Tape-based sort: push every tuple still in memory out to tape,
+        * then merge down to a single run (or, without randomAccess, to
+        * one run per tape).  mergeruns stores the resulting status.
+        */
+       dumptuples(state, true);
+       mergeruns(state);
+       state->eof_reached = false;
+       state->markpos_eof = false;
+       state->markpos_offset = 0;
+       state->markpos_block = 0L;
+   }
+   else
+       elog(ERROR, "tuplesort_performsort: invalid state");
+}
+
+/*
+ * Fetch the next tuple in either forward or back direction.
+ * Returns NULL if no more tuples.  If should_free is set, the
+ * caller must pfree the returned tuple when done with it.
+ *
+ * *should_free is set to false in the in-memory case (the returned pointer
+ * is an entry of memtuples[]) and to true in both tape cases (the tuple
+ * comes from READTUP or from the merge heap).  Backward fetches are only
+ * asserted to be legal when randomAccess was requested, and are not
+ * supported at all in TSS_FINALMERGE state.
+ */
+void *
+tuplesort_gettuple(Tuplesortstate *state, bool forward,
+                  bool *should_free)
+{
+   unsigned int    tuplen;
+   void           *tup;
+
+   switch (state->status)
+   {
+       case TSS_SORTEDINMEM:
+           Assert(forward || state->randomAccess);
+           *should_free = false;
+           if (forward)
+           {
+               /* "current" is the index of the next tuple to hand out */
+               if (state->current < state->memtupcount)
+                   return state->memtuples[state->current++];
+               state->eof_reached = true;
+               return NULL;
+           }
+           else
+           {
+               if (state->current <= 0)
+                   return NULL;
+               /*
+                * if all tuples are fetched already then we return last tuple,
+                * else - tuple before last returned.
+                */
+               if (state->eof_reached)
+                   state->eof_reached = false;
+               else
+               {
+                   state->current--; /* last returned tuple */
+                   if (state->current <= 0)
+                       return NULL;
+               }
+               return state->memtuples[state->current - 1];
+           }
+           break;
+
+       case TSS_SORTEDONTAPE:
+           Assert(forward || state->randomAccess);
+           *should_free = true;
+           if (forward)
+           {
+               if (state->eof_reached)
+                   return NULL;
+               /* a zero length word here is treated as end of data */
+               if ((tuplen = getlen(state, state->result_tape, true)) != 0)
+               {
+                   tup = READTUP(state, state->result_tape, tuplen);
+                   return tup;
+               }
+               else
+               {
+                   state->eof_reached = true;
+                   return NULL;
+               }
+           }
+           /* Backward.
+            *
+            * if all tuples are fetched already then we return last tuple,
+            * else - tuple before last returned.
+            *
+            * Both branches below leave the tape positioned at the trailing
+            * length word of the tuple that is to be returned.
+            */
+           if (state->eof_reached)
+           {
+               /*
+                * Seek position is pointing just past the zero tuplen
+                * at the end of file; back up to fetch last tuple's ending
+                * length word.  If seek fails we must have a completely empty
+                * file.
+                */
+               if (! LogicalTapeBackspace(state->tapeset,
+                                          state->result_tape,
+                                          2 * sizeof(unsigned int)))
+                   return NULL;
+               state->eof_reached = false;
+           }
+           else
+           {
+               /*
+                * Back up and fetch previously-returned tuple's ending length
+                * word.  If seek fails, assume we are at start of file.
+                */
+               if (! LogicalTapeBackspace(state->tapeset,
+                                          state->result_tape,
+                                          sizeof(unsigned int)))
+                   return NULL;
+               tuplen = getlen(state, state->result_tape, false);
+               /*
+                * Back up to get ending length word of tuple before it.
+                * (tuplen covers the front length word and the data; the
+                * two extra words are this tuple's trailing length word and
+                * the prior tuple's trailing length word.)
+                */
+               if (! LogicalTapeBackspace(state->tapeset,
+                                          state->result_tape,
+                                          tuplen + 2 * sizeof(unsigned int)))
+               {
+                   /* If that fails, presumably the prev tuple is the first
+                    * in the file.  Back up so that it becomes next to read
+                    * in forward direction (not obviously right, but that is
+                    * what in-memory case does).
+                    */
+                   if (! LogicalTapeBackspace(state->tapeset,
+                                              state->result_tape,
+                                              tuplen + sizeof(unsigned int)))
+                       elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+                   return NULL;
+               }
+           }
+
+           tuplen = getlen(state, state->result_tape, false);
+           /*
+            * Now we have the length of the prior tuple, back up and read it.
+            * Note: READTUP expects we are positioned after the initial
+            * length word of the tuple, so back up to that point.
+            */
+           if (! LogicalTapeBackspace(state->tapeset,
+                                      state->result_tape,
+                                      tuplen))
+               elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+           tup = READTUP(state, state->result_tape, tuplen);
+           return tup;
+
+       case TSS_FINALMERGE:
+           Assert(forward);
+           *should_free = true;
+           /*
+            * This code should match the inner loop of mergeonerun().
+            *
+            * Hand out the tuple at the top of the heap, then refill the
+            * heap with the next tuple (if any) from the same source tape.
+            */
+           if (state->heaptupcount > 0)
+           {
+               int     srcTape = state->heapsrctapes[0];
+
+               tup = state->heaptuples[0];
+               tuplesort_heap_siftup(state);
+               if ((tuplen = getlen(state, srcTape, true)) != 0)
+               {
+                   void   *newtup = READTUP(state, srcTape, tuplen);
+                   tuplesort_heap_insert(state, newtup, srcTape);
+               }
+               return tup;
+           }
+           /* heap is empty: all input tapes are exhausted */
+           return NULL;
+
+       default:
+           elog(ERROR, "tuplesort_gettuple: invalid state");
+           return NULL;        /* keep compiler quiet */
+   }
+}
+
+/*
+ * inittapes - initialize for tape sorting.
+ *
+ * Called only once we have discovered that the input will not fit in
+ * memory.  Creates the logical tape set and the heap array, sets up the
+ * Algorithm D bookkeeping, and switches the state to TSS_BUILDRUNS.
+ */
+static void
+inittapes(Tuplesortstate *state)
+{
+   int         tape;
+
+   state->tapeset = LogicalTapeSetCreate(MAXTAPES);
+
+   /*
+    * Size the heap array about a quarter larger than the number of tuples
+    * now in memtuples; that count is probably a good estimate of how many
+    * tuples the heap will be able to hold at once.
+    */
+   state->heaptupsize = state->memtupcount + state->memtupcount / 4;
+   state->heaptupcount = 0;
+   state->heaptuples = (void **) palloc(state->heaptupsize * sizeof(void *));
+
+   /*
+    * Algorithm D, step D1.  Each of the first TAPERANGE logical tapes
+    * starts with a target of one run, currently counted as a dummy run.
+    */
+   for (tape = 0; tape < TAPERANGE; tape++)
+   {
+       state->tp_tapenum[tape] = tape;
+       state->tp_runs[tape] = 0;
+       state->tp_fib[tape] = 1;
+       state->tp_dummy[tape] = 1;
+   }
+   /* The last logical tape starts with no runs of either kind. */
+   state->tp_tapenum[TAPERANGE] = TAPERANGE;
+   state->tp_runs[TAPERANGE] = 0;
+   state->tp_fib[TAPERANGE] = 0;
+   state->tp_dummy[TAPERANGE] = 0;
+
+   state->destTape = 0;
+   state->Level = 1;
+
+   state->multipleRuns = false;
+
+   state->status = TSS_BUILDRUNS;
+}
+
+/*
+ * selectnewtape -- select new tape for new initial run.
+ *
+ * Called after a run has been finished and another one must be started.
+ * Implements steps D3 and D4 of Algorithm D: normally just picks the next
+ * destination tape, but when the current level's dummy runs are used up
+ * it advances to the next level and recomputes the run targets.
+ */
+static void
+selectnewtape(Tuplesortstate *state)
+{
+   int     dest = state->destTape;
+   int     tape;
+   int     fib0;
+
+   /* Being asked for another tape means there are now at least two runs. */
+   state->multipleRuns = true;
+
+   /* Step D3: advance j (destTape) */
+   if (state->tp_dummy[dest] < state->tp_dummy[dest + 1])
+   {
+       state->destTape = dest + 1;
+       return;
+   }
+   if (state->tp_dummy[dest] != 0)
+   {
+       state->destTape = 0;
+       return;
+   }
+
+   /*
+    * Step D4: increase level.  The old tp_fib[0] is used throughout, and
+    * each entry is computed from its own old value and its right-hand
+    * neighbour's old value, so the loop must run left to right.
+    */
+   fib0 = state->tp_fib[0];
+   for (tape = 0; tape < TAPERANGE; tape++)
+   {
+       int     next = state->tp_fib[tape + 1];
+
+       state->tp_dummy[tape] = fib0 + next - state->tp_fib[tape];
+       state->tp_fib[tape] = fib0 + next;
+   }
+   state->Level++;
+   state->destTape = 0;
+}
+
+/*
+ * mergeruns -- merge all the completed initial runs.
+ *
+ * This implements steps D5, D6 of Algorithm D.  All input data has
+ * already been written to initial runs on tape (see dumptuples).
+ *
+ * On return, state->status is one of:
+ *   TSS_SORTEDONTAPE - state->result_tape names the frozen tape that
+ *                      holds the completely sorted output;
+ *   TSS_FINALMERGE   - beginmerge has loaded the heap for the final merge
+ *                      pass, which is not performed here (this exit is
+ *                      taken only when randomAccess is false).
+ */
+static void
+mergeruns(Tuplesortstate *state)
+{
+   int     tapenum,
+           svTape,
+           svRuns,
+           svDummy;
+
+   Assert(state->status == TSS_BUILDRUNS);
+   Assert(state->memtupcount == 0 && state->heaptupcount == 0);
+   /*
+    * If we produced only one initial run (quite likely if the total
+    * data volume is between 1X and 2X SortMem), we can just use that
+    * tape as the finished output, rather than doing a useless merge.
+    */
+   if (! state->multipleRuns)
+   {
+       state->result_tape = state->tp_tapenum[state->destTape];
+       /* must freeze and rewind the finished output tape */
+       LogicalTapeFreeze(state->tapeset, state->result_tape);
+       state->status = TSS_SORTEDONTAPE;
+       return;
+   }
+
+   /* End of step D2: rewind all output tapes to prepare for merging */
+   for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+       LogicalTapeRewind(state->tapeset, tapenum, false);
+
+   for (;;)
+   {
+       /* Step D5: merge runs onto tape[T] until tape[P] is empty */
+       while (state->tp_runs[TAPERANGE-1] || state->tp_dummy[TAPERANGE-1])
+       {
+           /*
+            * allDummy:  every input tape still has a dummy run pending.
+            * allOneRun: every input tape holds exactly one run, counting
+            *            real and dummy runs together.
+            */
+           bool    allDummy = true;
+           bool    allOneRun = true;
+
+           for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+           {
+               if (state->tp_dummy[tapenum] == 0)
+                   allDummy = false;
+               if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
+                   allOneRun = false;
+           }
+           /*
+            * If we don't have to produce a materialized sorted tape,
+            * quit as soon as we're down to one real/dummy run per tape.
+            */
+           if (! state->randomAccess && allOneRun)
+           {
+               Assert(! allDummy);
+               /* Initialize for the final merge pass */
+               beginmerge(state);
+               state->status = TSS_FINALMERGE;
+               return;
+           }
+           if (allDummy)
+           {   
+               /*
+                * Nothing real to merge: "merging" one dummy run from each
+                * input just yields one dummy run on the output tape, so
+                * only the counters change and no data is read or written.
+                */
+               state->tp_dummy[TAPERANGE]++;
+               for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+                   state->tp_dummy[tapenum]--;
+           }
+           else
+           {
+               mergeonerun(state);
+           }
+       }
+       /* Step D6: decrease level */
+       if (--state->Level == 0)
+           break;
+       /* rewind output tape T to use as new input */
+       LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE],
+                         false);
+       /* rewind used-up input tape P, and prepare it for write pass */
+       LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE-1],
+                         true);
+       state->tp_runs[TAPERANGE-1] = 0;
+       /* reassign tape units per step D6; note we no longer care about A[] */
+       /*
+        * The table is rotated by one slot: the old output tape (slot T)
+        * moves to slot 0 and every other tape moves up one slot, so the
+        * just-emptied tape P ends up in slot T as the next output tape.
+        */
+       svTape = state->tp_tapenum[TAPERANGE];
+       svDummy = state->tp_dummy[TAPERANGE];
+       svRuns = state->tp_runs[TAPERANGE];
+       for (tapenum = TAPERANGE; tapenum > 0; tapenum--)
+       {
+           state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum-1];
+           state->tp_dummy[tapenum] = state->tp_dummy[tapenum-1];
+           state->tp_runs[tapenum] = state->tp_runs[tapenum-1];
+       }
+       state->tp_tapenum[0] = svTape;
+       state->tp_dummy[0] = svDummy;
+       state->tp_runs[0] = svRuns;
+   }
+   /*
+    * Done.  Knuth says that the result is on TAPE[1], but since we exited
+    * the loop without performing the last iteration of step D6, we have not
+    * rearranged the tape unit assignment, and therefore the result is on
+    * TAPE[T].  We need to do it this way so that we can freeze the final
+    * output tape while rewinding it.  The last iteration of step D6 would
+    * be a waste of cycles anyway...
+    */
+   state->result_tape = state->tp_tapenum[TAPERANGE];
+   LogicalTapeFreeze(state->tapeset, state->result_tape);
+   state->status = TSS_SORTEDONTAPE;
+}
+
+/*
+ * mergeonerun -- perform a single merge within Algorithm D step D5.
+ *
+ * One run is taken from every input tape that is not currently showing
+ * a dummy run, and the merged tuples are appended to the output tape
+ * TAPE[T] as one new run.
+ */
+static void
+mergeonerun(Tuplesortstate *state)
+{
+   int             outTape = state->tp_tapenum[TAPERANGE];
+
+   /*
+    * Load the heap with the first tuple of each participating input run.
+    * beginmerge also decrements the per-tape run/dummy-run counters.
+    */
+   beginmerge(state);
+
+   /*
+    * Drain the heap: write out the frontmost (lowest) tuple, then try to
+    * refill from the tape that tuple came from.  getlen returns zero when
+    * it hits that tape's end-of-run marker, in which case nothing is
+    * re-inserted and the heap shrinks by one.
+    */
+   for (;;)
+   {
+       int             fromTape;
+       unsigned int    nextLen;
+       void           *nextTup;
+
+       if (state->heaptupcount <= 0)
+           break;
+
+       WRITETUP(state, outTape, state->heaptuples[0]);
+       fromTape = state->heapsrctapes[0];
+       tuplesort_heap_siftup(state);
+
+       nextLen = getlen(state, fromTape, true);
+       if (nextLen == 0)
+           continue;
+       nextTup = READTUP(state, fromTape, nextLen);
+       tuplesort_heap_insert(state, nextTup, fromTape);
+   }
+
+   /*
+    * The heap is empty, so the merge is complete: terminate the output
+    * run and count it as one more real run on TAPE[T].
+    */
+   markrunend(state, outTape);
+   state->tp_runs[TAPERANGE] += 1;
+}
+
+/*
+ * beginmerge - set up the heap for one merge pass
+ *
+ * Every input tape gives up one run.  For a dummy run only the tape's
+ * dummy counter is decremented; for a real run the run counter is
+ * decremented and the run's first tuple is pushed onto the heap, tagged
+ * with the tape it was read from.
+ */
+static void
+beginmerge(Tuplesortstate *state)
+{
+   int             slot;
+
+   Assert(state->heaptuples != NULL && state->heaptupcount == 0);
+   /* allocate the source-tape array if it does not exist yet */
+   if (! state->heapsrctapes)
+       state->heapsrctapes = (int *) palloc(MAXTAPES * sizeof(int));
+
+   for (slot = 0; slot < TAPERANGE; slot++)
+   {
+       int             inTape;
+       unsigned int    firstLen;
+       void           *firstTup;
+
+       if (state->tp_dummy[slot] > 0)
+       {
+           /* dummy run: there is nothing to read from this tape */
+           state->tp_dummy[slot] -= 1;
+           continue;
+       }
+
+       Assert(state->tp_runs[slot] > 0);
+       state->tp_runs[slot] -= 1;
+       inTape = state->tp_tapenum[slot];
+       /* eofOK is false: getlen reports an error on a zero length word */
+       firstLen = getlen(state, inTape, false);
+       firstTup = READTUP(state, inTape, firstLen);
+       tuplesort_heap_insert(state, firstTup, inTape);
+   }
+}
+
+/*
+ * beginrun - start a new initial run
+ *
+ * Every tuple waiting in the unsorted memtuples[] array is inserted into
+ * the (currently empty) heap, after which memtuples[] is marked empty.
+ * The source-tape number passed to the heap is always 0.
+ */
+static void
+beginrun(Tuplesortstate *state)
+{
+   int     next = 0;
+
+   Assert(state->heaptupcount == 0 && state->memtupcount > 0);
+   while (next < state->memtupcount)
+   {
+       tuplesort_heap_insert(state, state->memtuples[next], 0);
+       next++;
+   }
+   state->memtupcount = 0;
+}
+
+/*
+ * dumptuples - remove tuples from heap and write to tape
+ *
+ * When alltuples = false, dump only enough tuples to get under the
+ * availMem limit (and leave at least one tuple in the heap in any case,
+ * since puttuple assumes it always has a tuple to compare to).
+ *
+ * When alltuples = true, dump everything currently in memory.
+ * (This case is only used at end of input data.)
+ *
+ * If we empty the heap, then start a new run using the tuples that
+ * have accumulated in memtuples[] (if any).
+ *
+ * Tuples are written to the tape selected by state->destTape; each time
+ * the heap empties, that tape's run is terminated and its real/dummy run
+ * counters are updated.
+ *
+ * NOTE(review): with alltuples = true the loop condition is always true,
+ * so the heap must be nonempty on entry; only the Assert below checks it.
+ */
+static void
+dumptuples(Tuplesortstate *state, bool alltuples)
+{
+   while (alltuples ||
+          (LACKMEM(state) &&
+           (state->heaptupcount > 0 || state->memtupcount > 0)))
+   {
+       /*
+        * Dump the heap's frontmost entry, and sift up to remove it
+        * from the heap.
+        */
+       Assert(state->heaptupcount > 0);
+       WRITETUP(state, state->tp_tapenum[state->destTape],
+                state->heaptuples[0]);
+       tuplesort_heap_siftup(state);
+       /*
+        * If the heap is now empty, we've finished a run.
+        */
+       if (state->heaptupcount == 0)
+       {
+           /* close the run on tape and account for it */
+           markrunend(state, state->tp_tapenum[state->destTape]);
+           state->tp_runs[state->destTape]++;
+           state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+           /* this break is the only way out of the loop when alltuples */
+           if (state->memtupcount == 0)
+               break;          /* all input data has been written to tape */
+           /* Select new output tape and start a new run */
+           selectnewtape(state);
+           beginrun(state);
+       }
+   }
+}
+
+/*
+ * tuplesort_rescan        - rewind and replay the scan
+ *
+ * Resets the read position to the start of the sorted result and clears
+ * both the current and the marked end-of-data flags; the marked position
+ * is reset to the start as well.  Only valid once the sort has finished
+ * (status TSS_SORTEDINMEM or TSS_SORTEDONTAPE) and randomAccess was
+ * requested.
+ */
+void
+tuplesort_rescan(Tuplesortstate *state)
+{
+   Assert(state->randomAccess);
+
+   if (state->status == TSS_SORTEDINMEM)
+   {
+       state->current = 0;
+       state->markpos_offset = 0;
+   }
+   else if (state->status == TSS_SORTEDONTAPE)
+   {
+       LogicalTapeRewind(state->tapeset, state->result_tape, false);
+       state->markpos_block = 0L;
+       state->markpos_offset = 0;
+   }
+   else
+   {
+       elog(ERROR, "tuplesort_rescan: invalid state");
+       return;
+   }
+
+   /* common to both valid states */
+   state->eof_reached = false;
+   state->markpos_eof = false;
+}
+
+/*
+ * tuplesort_markpos   - saves current position in the merged sort file
+ *
+ * The saved position consists of markpos_offset (plus markpos_block for
+ * an on-tape result) and the current end-of-data flag.  Requires
+ * randomAccess and a finished sort.
+ */
+void
+tuplesort_markpos(Tuplesortstate *state)
+{
+   Assert(state->randomAccess);
+
+   if (state->status == TSS_SORTEDINMEM)
+   {
+       /* in-memory result: the position is simply state->current */
+       state->markpos_offset = state->current;
+   }
+   else if (state->status == TSS_SORTEDONTAPE)
+   {
+       /* on-tape result: ask the tape module for block and offset */
+       LogicalTapeTell(state->tapeset,
+                       state->result_tape,
+                       & state->markpos_block,
+                       & state->markpos_offset);
+   }
+   else
+   {
+       elog(ERROR, "tuplesort_markpos: invalid state");
+       return;
+   }
+
+   state->markpos_eof = state->eof_reached;
+}
+
+/*
+ * tuplesort_restorepos    - restores current position in merged sort file to
+ *                       last saved position
+ *
+ * Counterpart of tuplesort_markpos: puts back the saved position and the
+ * saved end-of-data flag.  For an on-tape result an error is raised if
+ * the tape cannot be repositioned.
+ */
+void
+tuplesort_restorepos(Tuplesortstate *state)
+{
+   Assert(state->randomAccess);
+
+   if (state->status == TSS_SORTEDINMEM)
+   {
+       state->current = state->markpos_offset;
+   }
+   else if (state->status == TSS_SORTEDONTAPE)
+   {
+       if (! LogicalTapeSeek(state->tapeset,
+                             state->result_tape,
+                             state->markpos_block,
+                             state->markpos_offset))
+           elog(ERROR, "tuplesort_restorepos failed");
+   }
+   else
+   {
+       elog(ERROR, "tuplesort_restorepos: invalid state");
+       return;
+   }
+
+   state->eof_reached = state->markpos_eof;
+}
+
+
+/*
+ * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
+ */
+
+/*
+ * Add one tuple to the heap kept in state->heaptuples[] (which may be
+ * empty), preserving the heap invariant.  When state->heapsrctapes is
+ * not NULL, the tuple's source tape number is kept in the parallel
+ * array state->heapsrctapes[] and moved along with the tuple.
+ */
+static void
+tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+                     int tapenum)
+{
+   int     hole;
+
+   /*
+    * Double the heap array when it is full.  heapsrctapes[] is not
+    * enlarged here; it is expected to be big enough from the start.
+    */
+   if (state->heaptupcount >= state->heaptupsize)
+   {
+       state->heaptupsize *= 2;
+       state->heaptuples = (void **)
+           repalloc(state->heaptuples,
+                    state->heaptupsize * sizeof(void *));
+   }
+
+   /*
+    * Open a hole at the end of the heap and move it toward the root,
+    * pulling each parent down, until the new tuple no longer sorts before
+    * the parent (Knuth 5.2.3 exercise 16, adapted to 0-based indexes).
+    */
+   hole = state->heaptupcount;
+   state->heaptupcount++;
+   for (; hole > 0; )
+   {
+       int     parent = (hole - 1) / 2;
+
+       if (COMPARETUP(state, tuple, state->heaptuples[parent]) >= 0)
+           break;
+       state->heaptuples[hole] = state->heaptuples[parent];
+       if (state->heapsrctapes != NULL)
+           state->heapsrctapes[hole] = state->heapsrctapes[parent];
+       hole = parent;
+   }
+
+   /* drop the new entry into the hole */
+   state->heaptuples[hole] = tuple;
+   if (state->heapsrctapes != NULL)
+       state->heapsrctapes[hole] = tapenum;
+}
+
+/*
+ * The tuple at state->heaptuples[0] has been removed from the heap.
+ * Decrement heaptupcount, and sift up to maintain the heap invariant.
+ *
+ * The former last entry of the heap is re-inserted: a "hole" starts at
+ * the root and moves down, each time pulling up the smaller child, until
+ * the re-inserted tuple is no larger than the smaller child or the hole
+ * has no children.  Entries of state->heapsrctapes[] (if that array
+ * exists) are moved in parallel with the tuples.
+ */
+static void
+tuplesort_heap_siftup(Tuplesortstate *state)
+{
+   void  **heaptuples = state->heaptuples;
+   void   *tuple;
+   int     i,
+           n;
+
+   /* nothing to rearrange if the heap is now empty */
+   if (--state->heaptupcount <= 0)
+       return;
+   /* n is the new entry count, and also the index of the old last entry */
+   n = state->heaptupcount;
+   tuple = heaptuples[n];      /* tuple that must be reinserted */
+   i = 0;                      /* i is where the "hole" is */
+    for (;;) {
+       int     j = 2*i + 1;    /* left child of the hole */
+
+       if (j >= n)
+           break;
+       /* pick the smaller of the two children, if there are two */
+       if (j+1 < n &&
+           COMPARETUP(state, heaptuples[j], heaptuples[j+1]) > 0)
+           j++;
+       if (COMPARETUP(state, tuple, heaptuples[j]) <= 0)
+           break;
+       heaptuples[i] = heaptuples[j];
+       if (state->heapsrctapes)
+           state->heapsrctapes[i] = state->heapsrctapes[j];
+       i = j;
+    }
+    heaptuples[i] = tuple;
+   /*
+    * Slot n was never written by the loop above (i and j stay below n),
+    * so heapsrctapes[n] still holds the reinserted tuple's source tape.
+    */
+   if (state->heapsrctapes)
+       state->heapsrctapes[i] = state->heapsrctapes[n];
+}
+
+
+/*
+ * Tape interface routines
+ */
+
+/*
+ * getlen - read the next length word from a tape
+ *
+ * Returns the length word.  A value of zero is returned only when eofOK
+ * is true; otherwise a zero length word is reported as an error, as is
+ * a short read.
+ */
+static unsigned int
+getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+   unsigned int    lenword;
+
+   if (LogicalTapeRead(state->tapeset, tapenum, (void *) &lenword,
+                       sizeof(lenword)) != sizeof(lenword))
+       elog(ERROR, "tuplesort: unexpected end of tape");
+   if (lenword != 0 || eofOK)
+       return lenword;
+   elog(ERROR, "tuplesort: unexpected end of data");
+   return lenword;
+}
+
+/*
+ * markrunend - write an end-of-run marker (a zero length word) to a tape
+ */
+static void
+markrunend(Tuplesortstate *state, int tapenum)
+{
+   unsigned int    endmarker;
+
+   endmarker = 0;
+   LogicalTapeWrite(state->tapeset, tapenum,
+                    (void *) &endmarker, sizeof(endmarker));
+}
+
+
+/*
+ * qsort interface
+ */
+
+/*
+ * qsort_comparetup - comparison callback handed to qsort()
+ *
+ * qsort passes pointers to array elements, and each element is itself a
+ * "void *" tuple pointer, so both arguments are dereferenced once before
+ * being given to COMPARETUP.  The sort state comes from the file-level
+ * variable qsort_tuplesortstate.
+ */
+static int
+qsort_comparetup(const void *a, const void *b)
+{
+   void  **lhs = (void **) a;
+   void  **rhs = (void **) b;
+
+   return COMPARETUP(qsort_tuplesortstate, *lhs, *rhs);
+}
+
+
+/*
+ * Routines specialized for HeapTuple case
+ */
+
+/*
+ * comparetup_heap - compare two heap tuples on the sort keys
+ *
+ * Keys are examined in order; the first key that distinguishes the
+ * tuples decides the result.  A NULL sorts after a non-NULL, and two
+ * NULLs leave the decision to the next key.  For non-NULL values the
+ * key's function is called on the pair in both orders: a nonzero answer
+ * for the first order yields a negative result, a nonzero answer for the
+ * swapped order yields a positive one.  SK_COMMUTE swaps which order is
+ * tried first.  Returns 0 if no key distinguishes the tuples.
+ */
+static int
+comparetup_heap(Tuplesortstate *state, const void *a, const void *b)
+{
+   HeapTuple   left = (HeapTuple) a;
+   HeapTuple   right = (HeapTuple) b;
+   ScanKey     key = state->scanKeys;
+   int         keyno;
+
+   for (keyno = 0; keyno < state->nKeys; keyno++, key++)
+   {
+       Datum       lval;
+       Datum       rval;
+       Datum       first;
+       Datum       second;
+       bool        lnull;
+       bool        rnull;
+       int         cmp;
+
+       lval = heap_getattr(left, key->sk_attno, state->tupDesc, &lnull);
+       rval = heap_getattr(right, key->sk_attno, state->tupDesc, &rnull);
+
+       if (lnull || rnull)
+       {
+           if (lnull && rnull)
+               continue;       /* both NULL: try the next key */
+           return lnull ? 1 : -1;  /* NULL sorts after non-NULL */
+       }
+
+       /* choose the argument order for the first call */
+       if (key->sk_flags & SK_COMMUTE)
+       {
+           first = rval;
+           second = lval;
+       }
+       else
+       {
+           first = lval;
+           second = rval;
+       }
+
+       cmp = - (int) (*fmgr_faddr(&key->sk_func)) (first, second);
+       if (cmp == 0)
+           cmp = (int) (*fmgr_faddr(&key->sk_func)) (second, first);
+       if (cmp != 0)
+           return cmp;
+   }
+
+   return 0;
+}
+
+/*
+ * copytup_heap - make a private copy of a heap tuple
+ *
+ * Charges HEAPTUPLESIZE plus the tuple's data length against the sort's
+ * memory accounting, then returns a copy made by heap_copytuple.
+ */
+static void *
+copytup_heap(Tuplesortstate *state, void *tup)
+{
+   HeapTuple   src = (HeapTuple) tup;
+   HeapTuple   dup;
+
+   USEMEM(state, HEAPTUPLESIZE + src->t_len);
+   dup = heap_copytuple(src);
+   return (void *) dup;
+}
+
+/*
+ * writetup_heap - write a heap tuple to tape and release its memory
+ *
+ * On-tape layout: a length word (data length plus the size of the length
+ * word itself), then the tuple data, then -- only when randomAccess is
+ * set -- a second copy of the length word.  We don't bother to write
+ * the HeapTupleData part of the tuple.
+ */
+
+static void
+writetup_heap(Tuplesortstate *state, int tapenum, void *tup)
+{
+   HeapTuple       htup = (HeapTuple) tup;
+   unsigned int    lenword;
+
+   lenword = htup->t_len + sizeof(lenword);
+
+   /* leading length word */
+   LogicalTapeWrite(state->tapeset, tapenum,
+                    (void *) &lenword, sizeof(lenword));
+   /* tuple body */
+   LogicalTapeWrite(state->tapeset, tapenum,
+                    (void *) htup->t_data, htup->t_len);
+   /* trailing length word, written only for randomAccess sorts */
+   if (state->randomAccess)
+   {
+       LogicalTapeWrite(state->tapeset, tapenum,
+                        (void *) &lenword, sizeof(lenword));
+   }
+
+   /* the in-memory copy is no longer needed */
+   FREEMEM(state, HEAPTUPLESIZE + htup->t_len);
+   pfree(htup);
+}
+
+/*
+ * readtup_heap - read a heap tuple back from tape
+ *
+ * "len" is the leading length word already fetched by the caller; it
+ * includes the size of the length word itself.  A new tuple is allocated
+ * with room for the HeapTupleData header followed by the data, the
+ * header is rebuilt, and the data is read from tape.  When randomAccess
+ * is set the trailing length word is read and discarded.
+ */
+static void *
+readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+   unsigned int    datalen = len - sizeof(unsigned int);
+   unsigned int    allocsize = datalen + HEAPTUPLESIZE;
+   unsigned int    trailer;
+   HeapTuple       htup;
+
+   htup = (HeapTuple) palloc(allocsize);
+   USEMEM(state, allocsize);
+
+   /* reconstruct the HeapTupleData portion */
+   htup->t_len = datalen;
+   ItemPointerSetInvalid(&(htup->t_self));
+   htup->t_data = (HeapTupleHeader) (((char *) htup) + HEAPTUPLESIZE);
+
+   /* read in the tuple proper */
+   if (LogicalTapeRead(state->tapeset, tapenum, (void *) htup->t_data,
+                       htup->t_len) != htup->t_len)
+       elog(ERROR, "tuplesort: unexpected end of data");
+
+   if (! state->randomAccess)
+       return (void *) htup;
+
+   /* skip over the trailing length word */
+   if (LogicalTapeRead(state->tapeset, tapenum, (void *) &trailer,
+                       sizeof(trailer)) != sizeof(trailer))
+       elog(ERROR, "tuplesort: unexpected end of data");
+   return (void *) htup;
+}
+
+
+/*
+ * Routines specialized for IndexTuple case
+ *
+ * NOTE: actually, these are specialized for the btree case; it's not
+ * clear whether you could use them for a non-btree index.  Possibly
+ * you'd need to make another set of routines if you needed to sort
+ * according to another kind of index.
+ */
+
+/*
+ * comparetup_index - compare two index tuples, column by column
+ *
+ * Each index column is compared with the index's btree "greater than"
+ * strategy.  Returns 1 if a sorts after b, -1 if a sorts before b, and
+ * 0 if no column distinguishes them.  A NULL sorts after a non-NULL;
+ * two NULLs in the same column are skipped for ordering purposes but
+ * are remembered so that the tuples are not treated as duplicates.
+ *
+ * If state->enforceUnique is set, an error is raised via elog(ERROR)
+ * when two different tuples compare equal with no NULL fields involved.
+ */
+static int
+comparetup_index(Tuplesortstate *state, const void *a, const void *b)
+{
+   IndexTuple  ltup = (IndexTuple) a;
+   IndexTuple  rtup = (IndexTuple) b;
+   TupleDesc   itdesc = state->indexRel->rd_att;
+   bool        equal_isnull = false;
+   Datum       lattr,
+               rattr;
+   bool        isnull1,
+               isnull2;
+   int         i;
+
+   for (i = 0; i < itdesc->natts; i++)
+   {
+       /* index attribute numbers are 1-based, hence i + 1 */
+       lattr = index_getattr(ltup, i + 1, itdesc, &isnull1);
+       rattr = index_getattr(rtup, i + 1, itdesc, &isnull2);
+
+       if (isnull1)
+       {
+           if (!isnull2)
+               return 1;       /* NULL sorts after non-NULL */
+           equal_isnull = true;
+           continue;
+       }
+       else if (isnull2)
+           return -1;
+
+       if (_bt_invokestrat(state->indexRel, i + 1,
+                           BTGreaterStrategyNumber,
+                           lattr, rattr))
+           return 1;
+       if (_bt_invokestrat(state->indexRel, i + 1,
+                           BTGreaterStrategyNumber,
+                           rattr, lattr))
+           return -1;
+   }
+
+   /*
+    * If btree has asked us to enforce uniqueness, complain if two equal
+    * tuples are detected (unless there was at least one NULL field).
+    *
+    * It is sufficient to make the test here, because if two tuples are
+    * equal they *must* get compared at some stage of the sort --- otherwise
+    * the sort algorithm wouldn't have checked whether one must appear
+    * before the other.
+    *
+    * A sort routine is permitted to call the comparator with both arguments
+    * pointing at the same element (some qsort implementations do so).
+    * A tuple is trivially equal to itself, which is not a uniqueness
+    * violation, so don't raise a bogus error in that case.
+    */
+   if (state->enforceUnique && !equal_isnull && ltup != rtup)
+       elog(ERROR, "Cannot create unique index. Table contains non-unique values");
+
+   return 0;
+}
+
+/*
+ * copytup_index - make a private copy of an index tuple
+ *
+ * The tuple's size (per IndexTupleSize) is charged against the sort's
+ * memory accounting and that many bytes are copied into fresh storage.
+ */
+static void *
+copytup_index(Tuplesortstate *state, void *tup)
+{
+   IndexTuple      src = (IndexTuple) tup;
+   unsigned int    nbytes = IndexTupleSize(src);
+   IndexTuple      dst;
+
+   USEMEM(state, nbytes);
+   dst = (IndexTuple) palloc(nbytes);
+   memcpy(dst, src, nbytes);
+   return (void *) dst;
+}
+
+/*
+ * writetup_index - write an index tuple to tape and release its memory
+ *
+ * On-tape layout: a length word (tuple size plus the size of the length
+ * word itself), then the whole tuple, then -- only when randomAccess is
+ * set -- a second copy of the length word.
+ */
+static void
+writetup_index(Tuplesortstate *state, int tapenum, void *tup)
+{
+   IndexTuple      itup = (IndexTuple) tup;
+   unsigned int    lenword;
+
+   lenword = IndexTupleSize(itup) + sizeof(lenword);
+
+   /* leading length word */
+   LogicalTapeWrite(state->tapeset, tapenum,
+                    (void *) &lenword, sizeof(lenword));
+   /* the tuple itself */
+   LogicalTapeWrite(state->tapeset, tapenum,
+                    (void *) itup, IndexTupleSize(itup));
+   /* trailing length word, written only for randomAccess sorts */
+   if (state->randomAccess)
+   {
+       LogicalTapeWrite(state->tapeset, tapenum,
+                        (void *) &lenword, sizeof(lenword));
+   }
+
+   /* the in-memory copy is no longer needed */
+   FREEMEM(state, IndexTupleSize(itup));
+   pfree(itup);
+}
+
+/*
+ * readtup_index - read an index tuple back from tape
+ *
+ * "len" is the leading length word already fetched by the caller; it
+ * includes the size of the length word itself.  The remaining bytes are
+ * read into freshly allocated storage.  When randomAccess is set the
+ * trailing length word is read and discarded.
+ */
+static void *
+readtup_index(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+   unsigned int    datalen = len - sizeof(unsigned int);
+   unsigned int    trailer;
+   IndexTuple      itup;
+
+   itup = (IndexTuple) palloc(datalen);
+   USEMEM(state, datalen);
+
+   if (LogicalTapeRead(state->tapeset, tapenum, (void *) itup,
+                       datalen) != datalen)
+       elog(ERROR, "tuplesort: unexpected end of data");
+
+   if (! state->randomAccess)
+       return (void *) itup;
+
+   /* skip over the trailing length word */
+   if (LogicalTapeRead(state->tapeset, tapenum, (void *) &trailer,
+                       sizeof(trailer)) != sizeof(trailer))
+       elog(ERROR, "tuplesort: unexpected end of data");
+   return (void *) itup;
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index 7c57a9a4f9971a1c3a4c23cc1671549027cddc4c..613595febf4df2b32a5f76841d34087260867202 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 1994, Regents of the University of California
   *
- * $Id: nbtree.h,v 1.31 1999/08/08 20:12:49 tgl Exp $
+ * $Id: nbtree.h,v 1.32 1999/10/17 22:15:03 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -288,9 +288,12 @@ extern BTItem _bt_formitem(IndexTuple itup);
  /*
   * prototypes for functions in nbtsort.c
   */
-extern void *_bt_spoolinit(Relation index, int ntapes, bool isunique);
-extern void _bt_spooldestroy(void *spool);
-extern void _bt_spool(Relation index, BTItem btitem, void *spool);
-extern void _bt_leafbuild(Relation index, void *spool);
+
+typedef struct BTSpool BTSpool;    /* opaque type known only within nbtsort.c */
+
+extern BTSpool *_bt_spoolinit(Relation index, bool isunique);
+extern void _bt_spooldestroy(BTSpool *btspool);
+extern void _bt_spool(BTItem btitem, BTSpool *btspool);
+extern void _bt_leafbuild(BTSpool *btspool);
  
  #endif  /* NBTREE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index 092fa57acb1174ecfdea28852796be71ada3304f..44aa8b8ace57ff9845e91e86a013f2ed32cc06df 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 1994, Regents of the University of California
   *
- * $Id: execnodes.h,v 1.36 1999/09/26 21:21:04 tgl Exp $
+ * $Id: execnodes.h,v 1.37 1999/10/17 22:15:07 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -597,17 +597,9 @@ typedef struct GroupState
  /* ----------------
   *  SortState information
   *
- *|        sort nodes are really just a kind of a scan since
- *|        we implement sorts by retrieving the entire subplan
- *|        into a temp relation, sorting the temp relation into
- *|        another sorted relation, and then preforming a simple
- *|        unqualified sequential scan on the sorted relation..
- *|        -cim 10/15/89
- *
- *     Flag            indicated whether relation has been sorted
- *     Keys            scan key structures used to keep info on sort keys
- *     TempRelation    temporary relation containing result of executing
- *                     the subplan.
+ *     sort_Done       indicates whether sort has been performed yet
+ *     sort_Keys       scan key structures describing the sort keys
+ *     tuplesortstate  private state of tuplesort.c
   *
   *  CommonScanState information
   *
@@ -628,9 +620,9 @@ typedef struct GroupState
  typedef struct SortState
  {
     CommonScanState csstate;    /* its first field is NodeTag */
-   bool        sort_Flag;
+   bool        sort_Done;
     ScanKey     sort_Keys;
-   bool        cleaned;
+   void       *tuplesortstate;
  } SortState;
  
  /* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h

index 095ee074d386bb53bd7c495694bb43367bc00b57..a03dacfb02b453aa10c6039eeb1099c432fe3824 100644 (file)
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 1994, Regents of the University of California
   *
- * $Id: plannodes.h,v 1.30 1999/08/21 03:49:09 tgl Exp $
+ * $Id: plannodes.h,v 1.31 1999/10/17 22:15:07 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -288,8 +288,6 @@ typedef struct Sort
     Oid         nonameid;
     int         keycount;
     SortState  *sortstate;
-   void       *psortstate;
-   bool        cleaned;
  } Sort;
  
  /* ----------------
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h

new file mode 100644 (file)

index 0000000..7c5a320
--- /dev/null
+++ b/src/include/utils/tuplesort.h
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.h
+ *   Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary).  It works efficiently for both small and large amounts
+ * of data.  Small amounts are sorted in-memory using qsort().  Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: tuplesort.h,v 1.1 1999/10/17 22:15:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPLESORT_H
+#define TUPLESORT_H
+
+#include "access/htup.h"
+#include "access/itup.h"
+#include "access/skey.h"
+#include "access/tupdesc.h"
+#include "utils/rel.h"
+
+/* Tuplesortstate is an opaque type whose details are not known outside tuplesort.c. */
+
+typedef struct Tuplesortstate Tuplesortstate;
+
+/*
+ * We provide two different interfaces to what is essentially the same
+ * code: one for sorting HeapTuples and one for sorting IndexTuples.
+ * They differ primarily in the way that the sort key information is
+ * supplied.
+ */
+
+extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
+                                           int nkeys, ScanKey keys,
+                                           bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index(Relation indexRel,
+                                            bool enforceUnique,
+                                            bool randomAccess);
+
+extern void tuplesort_puttuple(Tuplesortstate *state, void *tuple);
+
+extern void tuplesort_performsort(Tuplesortstate *state);
+
+extern void *tuplesort_gettuple(Tuplesortstate *state, bool forward,
+                               bool *should_free);
+#define tuplesort_getheaptuple(state, forward, should_free) \
+   ((HeapTuple) tuplesort_gettuple(state, forward, should_free))
+#define tuplesort_getindextuple(state, forward, should_free) \
+   ((IndexTuple) tuplesort_gettuple(state, forward, should_free))
+
+extern void tuplesort_end(Tuplesortstate *state);
+
+/*
+ * These routines may only be called if randomAccess was specified 'true'.
+ * Backwards scan in gettuple is likewise only allowed if randomAccess.
+ */
+
+extern void tuplesort_rescan(Tuplesortstate *state);
+extern void tuplesort_markpos(Tuplesortstate *state);
+extern void tuplesort_restorepos(Tuplesortstate *state);
+
+#endif  /* TUPLESORT_H */
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 17 Oct 1999 22:15:09 +0000 (22:15 +0000)
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsort.c		patch \| blob \| blame \| history
src/backend/executor/nodeSort.c		patch \| blob \| blame \| history
src/backend/utils/sort/Makefile		patch \| blob \| blame \| history
src/backend/utils/sort/logtape.c		patch \| blob \| blame \| history
src/backend/utils/sort/tuplesort.c	[new file with mode: 0644]	patch \| blob
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/include/nodes/execnodes.h		patch \| blob \| blame \| history
src/include/nodes/plannodes.h		patch \| blob \| blame \| history
src/include/utils/tuplesort.h	[new file with mode: 0644]	patch \| blob