Improve speed of hash index build.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c

index aa61e39f26acba4b9195ae1657d4ac2fd3812580..19563148d052bddd353b4cf6e6fc472f1df10bec 100644 (file)
--- a/src/backend/access/hash/hashsort.c
+++ b/src/backend/access/hash/hashsort.c
@@ -42,9 +42,10 @@ struct HSpool
         Relation        index;
  
         /*
-        * We sort the hash keys based on the buckets they belong to. Below masks
-        * are used in _hash_hashkey2bucket to determine the bucket of given hash
-        * key.
+        * We sort the hash keys based on the buckets they belong to, then by the
+        * hash values themselves, to optimize insertions onto hash pages.  The
+        * masks below are used in _hash_hashkey2bucket to determine the bucket of
+        * a given hash key.
          */
         uint32          high_mask;
         uint32          low_mask;
diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c

index 2933020dcc85e8202eb6fd8589040d559cebec92..7ad4429ad3d3e1d54572723b17c85edf644e71af 100644 (file)
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -1387,14 +1387,17 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
  {
         Bucket          bucket1;
         Bucket          bucket2;
+       uint32          hash1;
+       uint32          hash2;
         IndexTuple      tuple1;
         IndexTuple      tuple2;
         TuplesortPublic *base = TuplesortstateGetPublic(state);
         TuplesortIndexHashArg *arg = (TuplesortIndexHashArg *) base->arg;
  
         /*
-        * Fetch hash keys and mask off bits we don't want to sort by. We know
-        * that the first column of the index tuple is the hash key.
+        * Fetch hash keys and mask off bits we don't want to sort by, so that the
+        * initial sort is just on the bucket number.  We know that the first
+        * column of the index tuple is the hash key.
          */
         Assert(!a->isnull1);
         bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
@@ -1409,6 +1412,18 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
         else if (bucket1 < bucket2)
                 return -1;
  
+       /*
+        * If bucket values are equal, sort by hash values.  This allows us to
+        * insert directly onto bucket/overflow pages, where the index tuples are
+        * stored in hash order to allow fast binary search within each page.
+        */
+       hash1 = DatumGetUInt32(a->datum1);
+       hash2 = DatumGetUInt32(b->datum1);
+       if (hash1 > hash2)
+               return 1;
+       else if (hash1 < hash2)
+               return -1;
+
         /*
          * If hash values are equal, we sort on ItemPointer.  This does not affect
          * validity of the finished index, but it may be useful to have index
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 28 Jul 2022 18:34:32 +0000 (14:34 -0400)
src/backend/access/hash/hashsort.c		patch | blob | blame | history
src/backend/utils/sort/tuplesortvariants.c		patch | blob | blame | history