Merge branch 'XL_MASTER_MERGE_9_4' into XL_NEW_MASTER

Conflicts: src/test/regress/expected/aggregates.out src/test/regress/expected/create_index.out src/test/regress/expected/inherit.out src/test/regress/expected/join.out src/test/regress/expected/window.out src/test/regress/expected/with.out
author: Pavan Deolasee 2015-05-05 09:19:18 +0000
committer: Pavan Deolasee 2015-05-05 09:19:18 +0000
commit: 73fa25c67cbfa24c03e28c96bf356f2592671730 (patch)
tree: 10ded7e26abd78d93658cb72fc5cb9d4672eff2a /src/backend
parent: da4d108859bcd7a308ca75aba54281e32968822c (diff)
parent: 4a9ab6d8619817f9e3989c99b65140e19041dab7 (diff)
711 files changed, 252214 insertions, 117889 deletions
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 611d29fa87..3d5d3eae30 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for the postgres backend
 #
-# Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 # Portions Copyright (c) 1994, Regents of the University of California
 #
 # src/backend/Makefile
@@ -51,10 +51,12 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
 	$(top_builddir)/src/port/libpgport_srv.a \
 	$(top_builddir)/src/gtm/client/libgtmclient.a \
 	$(top_builddir)/src/gtm/common/libgtm.a \
-	$(top_builddir)/src/gtm/libpq/libpqcomm.a	
+	$(top_builddir)/src/gtm/libpq/libpqcomm.a	 \
+       $(top_builddir)/src/common/libpgcommon_srv.a
 
-# We put libpgport into OBJS, so remove it from LIBS; also add libldap
-LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
+# We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add
+# libldap
+LIBS := $(filter-out -lpgport -lpgcommon, $(LIBS)) $(LDAP_LIBS_BE)
 
 # The backend doesn't need everything that's in LIBS, however
 LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS))
@@ -78,36 +80,26 @@ endif
 
 ifeq ($(PORTNAME), cygwin)
 
-postgres: $(OBJS) postgres.def libpostgres.a
-	$(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def
-	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(LIBS)
-	$(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def
-	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack,$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(LIBS)
-	rm -f $@.exp $@.base
+postgres: $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) -Wl,--stack,$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$^) $(LIBS) -o $@
 
-postgres.def: $(OBJS)
-	$(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^)
+# There is no correct way to write a rule that generates two files.
+# Rules with two targets don't have that meaning, they are merely
+# shorthand for two otherwise separate rules.  To be safe for parallel
+# make, we must chain the dependencies like this.  The semicolon is
+# important, otherwise make will choose some built-in rule.
 
-libpostgres.a: postgres.def
-	$(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@
+libpostgres.a: postgres ;
 
 endif # cygwin
 
 ifeq ($(PORTNAME), win32)
 LIBS += -lsecur32
 
-postgres: $(OBJS) postgres.def libpostgres.a $(WIN32RES)
-	$(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def
-	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS)
-	$(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def
-	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS)
-	rm -f $@.exp $@.base
-
-postgres.def: $(OBJS)
-	$(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^)
+postgres: $(OBJS) $(WIN32RES)
+	$(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS) -o $@$(X)
 
-libpostgres.a: postgres.def
-	$(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@
+libpostgres.a: postgres ;
 
 endif # win32
 
@@ -253,6 +245,7 @@ else
 endif
 ifeq ($(MAKE_EXPORTS), true)
 	$(INSTALL_DATA) $(POSTGRES_IMP) '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)'
+	$(INSTALL_PROGRAM) $(MKLDEXPORT) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh'
 endif
 
 .PHONY: install-bin
@@ -271,6 +264,7 @@ endif
 endif
 ifeq ($(MAKE_EXPORTS), true)
 	$(MKDIR_P) '$(DESTDIR)$(pkglibdir)'
+	$(MKDIR_P) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)'
 endif
 
 
@@ -280,6 +274,7 @@ uninstall:
 	rm -f '$(DESTDIR)$(bindir)/postgres$(X)' '$(DESTDIR)$(bindir)/postmaster'
 ifeq ($(MAKE_EXPORTS), true)
 	rm -f '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)'
+	rm -f '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh'
 endif
 ifeq ($(PORTNAME), cygwin)
 ifeq ($(MAKE_DLL), true)
@@ -308,10 +303,10 @@ clean:
 		$(top_builddir)/src/include/utils/fmgroids.h \
 		$(top_builddir)/src/include/utils/probes.h
 ifeq ($(PORTNAME), cygwin)
-	rm -f postgres.dll postgres.def libpostgres.a
+	rm -f postgres.dll libpostgres.a
 endif
 ifeq ($(PORTNAME), win32)
-	rm -f postgres.dll postgres.def libpostgres.a $(WIN32RES)
+	rm -f postgres.dll libpostgres.a $(WIN32RES)
 endif
 
 distclean: clean
@@ -329,7 +324,6 @@ maintainer-clean: distclean
 	      catalog/postgres.description \
 	      catalog/postgres.shdescription \
 	      replication/repl_gram.c \
-	      replication/repl_gram.h \
 	      replication/repl_scanner.c \
 	      utils/fmgroids.h \
 	      utils/fmgrtab.c \
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index 0366d59624..c32088f81d 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/access
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS	    = common gist hash heap index nbtree transam gin spgist
+SUBDIRS	    = common gin gist hash heap index nbtree rmgrdesc spgist transam
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 62479c04bb..b81bddf3c1 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -21,7 +21,7 @@
  * tuptoaster.c.
  *
  * This change will break any code that assumes it needn't detoast values
- * that have been put into a tuple but never sent to disk.	Hopefully there
+ * that have been put into a tuple but never sent to disk.  Hopefully there
  * are few such places.
  *
  * Varlenas still have alignment 'i' (or 'd') in pg_type/pg_attribute, since
@@ -50,7 +50,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  *
  * Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -69,6 +69,7 @@
 #include "access/tuptoaster.h"
 #include "executor/tuptable.h"
 #ifdef XCP
+#include "lib/stringinfo.h"
 #include "utils/memutils.h"
 #endif
 
@@ -400,7 +401,7 @@ nocachegetattr(HeapTuple tuple,
 
 		/*
 		 * Otherwise, check for non-fixed-length attrs up to and including
-		 * target.	If there aren't any, it's safe to cheaply initialize the
+		 * target.  If there aren't any, it's safe to cheaply initialize the
 		 * cached offsets for these attrs.
 		 */
 		if (HeapTupleHasVarWidth(tuple))
@@ -467,7 +468,7 @@ nocachegetattr(HeapTuple tuple,
 		 *
 		 * Note - This loop is a little tricky.  For each non-null attribute,
 		 * we have to first account for alignment padding before the attr,
-		 * then advance over the attr based on its length.	Nulls have no
+		 * then advance over the attr based on its length.  Nulls have no
 		 * storage and no alignment padding either.  We can use/set
 		 * attcacheoff until we reach either a null or a var-width attribute.
 		 */
@@ -552,17 +553,17 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
 			result = ObjectIdGetDatum(HeapTupleGetOid(tup));
 			break;
 		case MinTransactionIdAttributeNumber:
-			result = TransactionIdGetDatum(HeapTupleHeaderGetXmin(tup->t_data));
+			result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data));
 			break;
 		case MaxTransactionIdAttributeNumber:
-			result = TransactionIdGetDatum(HeapTupleHeaderGetXmax(tup->t_data));
+			result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data));
 			break;
 		case MinCommandIdAttributeNumber:
 		case MaxCommandIdAttributeNumber:
 
 			/*
 			 * cmin and cmax are now both aliases for the same field, which
-			 * can in fact also be a combo command id.	XXX perhaps we should
+			 * can in fact also be a combo command id.  XXX perhaps we should
 			 * return the "real" cmin or cmax if possible, that is if we are
 			 * inside the originating transaction?
 			 */
@@ -641,6 +642,41 @@ heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest)
 	memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len);
 }
 
+/* ----------------
+ *		heap_copy_tuple_as_datum
+ *
+ *		copy a tuple as a composite-type Datum
+ * ----------------
+ */
+Datum
+heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc)
+{
+	HeapTupleHeader td;
+
+	/*
+	 * If the tuple contains any external TOAST pointers, we have to inline
+	 * those fields to meet the conventions for composite-type Datums.
+	 */
+	if (HeapTupleHasExternal(tuple))
+		return toast_flatten_tuple_to_datum(tuple->t_data,
+											tuple->t_len,
+											tupleDesc);
+
+	/*
+	 * Fast path for easy case: just make a palloc'd copy and insert the
+	 * correct composite-Datum header fields (since those may not be set if
+	 * the given tuple came from disk, rather than from heap_form_tuple).
+	 */
+	td = (HeapTupleHeader) palloc(tuple->t_len);
+	memcpy((char *) td, (char *) tuple->t_data, tuple->t_len);
+
+	HeapTupleHeaderSetDatumLength(td, tuple->t_len);
+	HeapTupleHeaderSetTypeId(td, tupleDesc->tdtypeid);
+	HeapTupleHeaderSetTypMod(td, tupleDesc->tdtypmod);
+
+	return PointerGetDatum(td);
+}
+
 /*
  * heap_form_tuple
  *		construct a tuple from the given values[] and isnull[] arrays,
@@ -659,7 +695,6 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 				data_len;
 	int			hoff;
 	bool		hasnull = false;
-	Form_pg_attribute *att = tupleDescriptor->attrs;
 	int			numberOfAttributes = tupleDescriptor->natts;
 	int			i;
 
@@ -670,28 +705,14 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 						numberOfAttributes, MaxTupleAttributeNumber)));
 
 	/*
-	 * Check for nulls and embedded tuples; expand any toasted attributes in
-	 * embedded tuples.  This preserves the invariant that toasting can only
-	 * go one level deep.
-	 *
-	 * We can skip calling toast_flatten_tuple_attribute() if the attribute
-	 * couldn't possibly be of composite type.  All composite datums are
-	 * varlena and have alignment 'd'; furthermore they aren't arrays. Also,
-	 * if an attribute is already toasted, it must have been sent to disk
-	 * already and so cannot contain toasted attributes.
+	 * Check for nulls
 	 */
 	for (i = 0; i < numberOfAttributes; i++)
 	{
 		if (isnull[i])
-			hasnull = true;
-		else if (att[i]->attlen == -1 &&
-				 att[i]->attalign == 'd' &&
-				 att[i]->attndims == 0 &&
-				 !VARATT_IS_EXTENDED(DatumGetPointer(values[i])))
 		{
-			values[i] = toast_flatten_tuple_attribute(values[i],
-													  att[i]->atttypid,
-													  att[i]->atttypmod);
+			hasnull = true;
+			break;
 		}
 	}
 
@@ -713,7 +734,7 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 	len += data_len;
 
 	/*
-	 * Allocate and zero the space needed.	Note that the tuple body and
+	 * Allocate and zero the space needed.  Note that the tuple body and
 	 * HeapTupleData management structure are allocated in one chunk.
 	 */
 	tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
@@ -721,7 +742,8 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 
 	/*
 	 * And fill in the information.  Note we fill the Datum fields even though
-	 * this tuple may never become a Datum.
+	 * this tuple may never become a Datum.  This lets HeapTupleHeaderGetDatum
+	 * identify the tuple type if needed.
 	 */
 	tuple->t_len = len;
 	ItemPointerSetInvalid(&(tuple->t_self));
@@ -1641,7 +1663,6 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 				data_len;
 	int			hoff;
 	bool		hasnull = false;
-	Form_pg_attribute *att = tupleDescriptor->attrs;
 	int			numberOfAttributes = tupleDescriptor->natts;
 	int			i;
 
@@ -1652,28 +1673,14 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 						numberOfAttributes, MaxTupleAttributeNumber)));
 
 	/*
-	 * Check for nulls and embedded tuples; expand any toasted attributes in
-	 * embedded tuples.  This preserves the invariant that toasting can only
-	 * go one level deep.
-	 *
-	 * We can skip calling toast_flatten_tuple_attribute() if the attribute
-	 * couldn't possibly be of composite type.  All composite datums are
-	 * varlena and have alignment 'd'; furthermore they aren't arrays. Also,
-	 * if an attribute is already toasted, it must have been sent to disk
-	 * already and so cannot contain toasted attributes.
+	 * Check for nulls
 	 */
 	for (i = 0; i < numberOfAttributes; i++)
 	{
 		if (isnull[i])
-			hasnull = true;
-		else if (att[i]->attlen == -1 &&
-				 att[i]->attalign == 'd' &&
-				 att[i]->attndims == 0 &&
-				 !VARATT_IS_EXTENDED(values[i]))
 		{
-			values[i] = toast_flatten_tuple_attribute(values[i],
-													  att[i]->atttypid,
-													  att[i]->atttypmod);
+			hasnull = true;
+			break;
 		}
 	}
 
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index 76c76e9e89..5fd400990b 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -4,7 +4,7 @@
  *	   This file contains index tuple accessor and mutator routines,
  *	   as well as various tuple utilities.
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -71,7 +71,7 @@ index_form_tuple(TupleDesc tupleDescriptor,
 
 		/*
 		 * If value is stored EXTERNAL, must fetch it so we are not depending
-		 * on outside storage.	This should be improved someday.
+		 * on outside storage.  This should be improved someday.
 		 */
 		if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i])))
 		{
@@ -158,6 +158,11 @@ index_form_tuple(TupleDesc tupleDescriptor,
 	if (tupmask & HEAP_HASVARWIDTH)
 		infomask |= INDEX_VAR_MASK;
 
+	/* Also assert we got rid of external attributes */
+#ifdef TOAST_INDEX_HACK
+	Assert((tupmask & HEAP_HASEXTERNAL) == 0);
+#endif
+
 	/*
 	 * Here we make sure that the size will fit in the field reserved for it
 	 * in t_info.
@@ -165,9 +170,8 @@ index_form_tuple(TupleDesc tupleDescriptor,
 	if ((size & INDEX_SIZE_MASK) != size)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("index row requires %lu bytes, maximum size is %lu",
-						(unsigned long) size,
-						(unsigned long) INDEX_SIZE_MASK)));
+				 errmsg("index row requires %zu bytes, maximum size is %zu",
+						size, (Size) INDEX_SIZE_MASK)));
 
 	infomask |= size;
 
@@ -276,7 +280,7 @@ nocache_index_getattr(IndexTuple tup,
 
 		/*
 		 * Otherwise, check for non-fixed-length attrs up to and including
-		 * target.	If there aren't any, it's safe to cheaply initialize the
+		 * target.  If there aren't any, it's safe to cheaply initialize the
 		 * cached offsets for these attrs.
 		 */
 		if (IndexTupleHasVarwidths(tup))
@@ -343,7 +347,7 @@ nocache_index_getattr(IndexTuple tup,
 		 *
 		 * Note - This loop is a little tricky.  For each non-null attribute,
 		 * we have to first account for alignment padding before the attr,
-		 * then advance over the attr based on its length.	Nulls have no
+		 * then advance over the attr based on its length.  Nulls have no
 		 * storage and no alignment padding either.  We can use/set
 		 * attcacheoff until we reach either a null or a var-width attribute.
 		 */
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index 10a45a3146..d033cdd09a 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -10,7 +10,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  *
  * Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -28,6 +28,9 @@
 #ifdef PGXC
 #include "pgxc/pgxc.h"
 #endif
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
 
 static void printtup_startup(DestReceiver *self, int operation,
 				 TupleDesc typeinfo);
@@ -67,6 +70,7 @@ typedef struct
 	TupleDesc	attrinfo;		/* The attr info we are set up for */
 	int			nattrs;
 	PrinttupAttrInfo *myinfo;	/* Cached info about each attr */
+	MemoryContext tmpcontext;	/* Memory context for per-row workspace */
 } DR_printtup;
 
 /* ----------------
@@ -93,6 +97,7 @@ printtup_create_DR(CommandDest dest)
 	self->attrinfo = NULL;
 	self->nattrs = 0;
 	self->myinfo = NULL;
+	self->tmpcontext = NULL;
 
 	return (DestReceiver *) self;
 }
@@ -130,6 +135,18 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	DR_printtup *myState = (DR_printtup *) self;
 	Portal		portal = myState->portal;
 
+	/*
+	 * Create a temporary memory context that we can reset once per row to
+	 * recover palloc'd memory.  This avoids any problems with leaks inside
+	 * datatype output routines, and should be faster than retail pfree's
+	 * anyway.
+	 */
+	myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+												"printtup",
+												ALLOCSET_DEFAULT_MINSIZE,
+												ALLOCSET_DEFAULT_INITSIZE,
+												ALLOCSET_DEFAULT_MAXSIZE);
+
 	if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3)
 	{
 		/*
@@ -173,7 +190,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
  * or some similar function; it does not contain a full set of fields.
  * The targetlist will be NIL when executing a utility function that does
  * not have a plan.  If the targetlist isn't NIL then it is a Query node's
- * targetlist; it is up to us to ignore resjunk columns in it.	The formats[]
+ * targetlist; it is up to us to ignore resjunk columns in it.  The formats[]
  * array pointer might be NULL (if we are doing Describe on a prepared stmt);
  * send zeroes for the format codes in that case.
  */
@@ -309,6 +326,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 {
 	TupleDesc	typeinfo = slot->tts_tupleDescriptor;
 	DR_printtup *myState = (DR_printtup *) self;
+	MemoryContext oldcontext;
 	StringInfoData buf;
 	int			natts = typeinfo->natts;
 	int			i;
@@ -341,8 +359,11 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 	/* Make sure the tuple is fully deconstructed */
 	slot_getallattrs(slot);
 
+	/* Switch into per-row context so we can recover memory below */
+	oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
 	/*
-	 * Prepare a DataRow message
+	 * Prepare a DataRow message (note buffer is in per-row context)
 	 */
 	pq_beginmessage(&buf, 'D');
 
@@ -354,8 +375,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 	for (i = 0; i < natts; ++i)
 	{
 		PrinttupAttrInfo *thisState = myState->myinfo + i;
-		Datum		origattr = slot->tts_values[i],
-					attr;
+		Datum		attr = slot->tts_values[i];
 
 		if (slot->tts_isnull[i])
 		{
@@ -364,13 +384,15 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 		}
 
 		/*
-		 * If we have a toasted datum, forcibly detoast it here to avoid
-		 * memory leakage inside the type's output routine.
+		 * Here we catch undefined bytes in datums that are returned to the
+		 * client without hitting disk; see comments at the related check in
+		 * PageAddItem().  This test is most useful for uncompressed,
+		 * non-external datums, but we're quite likely to see such here when
+		 * testing new C functions.
 		 */
 		if (thisState->typisvarlena)
-			attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
-		else
-			attr = origattr;
+			VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr),
+										  VARSIZE_ANY(attr));
 
 		if (thisState->format == 0)
 		{
@@ -379,7 +401,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 
 			outputstr = OutputFunctionCall(&thisState->finfo, attr);
 			pq_sendcountedtext(&buf, outputstr, strlen(outputstr), false);
-			pfree(outputstr);
 		}
 		else
 		{
@@ -390,15 +411,14 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
 			pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4);
 			pq_sendbytes(&buf, VARDATA(outputbytes),
 						 VARSIZE(outputbytes) - VARHDRSZ);
-			pfree(outputbytes);
 		}
-
-		/* Clean up detoasted copy, if any */
-		if (DatumGetPointer(attr) != DatumGetPointer(origattr))
-			pfree(DatumGetPointer(attr));
 	}
 
 	pq_endmessage(&buf);
+
+	/* Return to caller's context, and flush row's temporary memory */
+	MemoryContextSwitchTo(oldcontext);
+	MemoryContextReset(myState->tmpcontext);
 }
 
 /* ----------------
@@ -410,6 +430,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
 {
 	TupleDesc	typeinfo = slot->tts_tupleDescriptor;
 	DR_printtup *myState = (DR_printtup *) self;
+	MemoryContext oldcontext;
 	StringInfoData buf;
 	int			natts = typeinfo->natts;
 	int			i,
@@ -423,6 +444,9 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
 	/* Make sure the tuple is fully deconstructed */
 	slot_getallattrs(slot);
 
+	/* Switch into per-row context so we can recover memory below */
+	oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
 	/*
 	 * tell the frontend to expect new tuple data (in ASCII style)
 	 */
@@ -454,8 +478,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
 	for (i = 0; i < natts; ++i)
 	{
 		PrinttupAttrInfo *thisState = myState->myinfo + i;
-		Datum		origattr = slot->tts_values[i],
-					attr;
+		Datum		attr = slot->tts_values[i];
 		char	   *outputstr;
 
 		if (slot->tts_isnull[i])
@@ -463,25 +486,15 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
 
 		Assert(thisState->format == 0);
 
-		/*
-		 * If we have a toasted datum, forcibly detoast it here to avoid
-		 * memory leakage inside the type's output routine.
-		 */
-		if (thisState->typisvarlena)
-			attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
-		else
-			attr = origattr;
-
 		outputstr = OutputFunctionCall(&thisState->finfo, attr);
 		pq_sendcountedtext(&buf, outputstr, strlen(outputstr), true);
-		pfree(outputstr);
-
-		/* Clean up detoasted copy, if any */
-		if (DatumGetPointer(attr) != DatumGetPointer(origattr))
-			pfree(DatumGetPointer(attr));
 	}
 
 	pq_endmessage(&buf);
+
+	/* Return to caller's context, and flush row's temporary memory */
+	MemoryContextSwitchTo(oldcontext);
+	MemoryContextReset(myState->tmpcontext);
 }
 
 /* ----------------
@@ -498,6 +511,10 @@ printtup_shutdown(DestReceiver *self)
 	myState->myinfo = NULL;
 
 	myState->attrinfo = NULL;
+
+	if (myState->tmpcontext)
+		MemoryContextDelete(myState->tmpcontext);
+	myState->tmpcontext = NULL;
 }
 
 /* ----------------
@@ -560,8 +577,7 @@ debugtup(TupleTableSlot *slot, DestReceiver *self)
 	TupleDesc	typeinfo = slot->tts_tupleDescriptor;
 	int			natts = typeinfo->natts;
 	int			i;
-	Datum		origattr,
-				attr;
+	Datum		attr;
 	char	   *value;
 	bool		isnull;
 	Oid			typoutput;
@@ -569,30 +585,15 @@ debugtup(TupleTableSlot *slot, DestReceiver *self)
 
 	for (i = 0; i < natts; ++i)
 	{
-		origattr = slot_getattr(slot, i + 1, &isnull);
+		attr = slot_getattr(slot, i + 1, &isnull);
 		if (isnull)
 			continue;
 		getTypeOutputInfo(typeinfo->attrs[i]->atttypid,
 						  &typoutput, &typisvarlena);
 
-		/*
-		 * If we have a toasted datum, forcibly detoast it here to avoid
-		 * memory leakage inside the type's output routine.
-		 */
-		if (typisvarlena)
-			attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
-		else
-			attr = origattr;
-
 		value = OidOutputFunctionCall(typoutput, attr);
 
 		printatt((unsigned) i + 1, typeinfo->attrs[i], value);
-
-		pfree(value);
-
-		/* Clean up detoasted copy, if any */
-		if (DatumGetPointer(attr) != DatumGetPointer(origattr))
-			pfree(DatumGetPointer(attr));
 	}
 	printf("\t----\n");
 }
@@ -611,6 +612,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
 {
 	TupleDesc	typeinfo = slot->tts_tupleDescriptor;
 	DR_printtup *myState = (DR_printtup *) self;
+	MemoryContext oldcontext;
 	StringInfoData buf;
 	int			natts = typeinfo->natts;
 	int			i,
@@ -624,6 +626,9 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
 	/* Make sure the tuple is fully deconstructed */
 	slot_getallattrs(slot);
 
+	/* Switch into per-row context so we can recover memory below */
+	oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
 	/*
 	 * tell the frontend to expect new tuple data (in binary style)
 	 */
@@ -655,8 +660,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
 	for (i = 0; i < natts; ++i)
 	{
 		PrinttupAttrInfo *thisState = myState->myinfo + i;
-		Datum		origattr = slot->tts_values[i],
-					attr;
+		Datum		attr = slot->tts_values[i];
 		bytea	   *outputbytes;
 
 		if (slot->tts_isnull[i])
@@ -664,26 +668,15 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
 
 		Assert(thisState->format == 1);
 
-		/*
-		 * If we have a toasted datum, forcibly detoast it here to avoid
-		 * memory leakage inside the type's output routine.
-		 */
-		if (thisState->typisvarlena)
-			attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
-		else
-			attr = origattr;
-
 		outputbytes = SendFunctionCall(&thisState->finfo, attr);
-		/* We assume the result will not have been toasted */
 		pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4);
 		pq_sendbytes(&buf, VARDATA(outputbytes),
 					 VARSIZE(outputbytes) - VARHDRSZ);
-		pfree(outputbytes);
-
-		/* Clean up detoasted copy, if any */
-		if (DatumGetPointer(attr) != DatumGetPointer(origattr))
-			pfree(DatumGetPointer(attr));
 	}
 
 	pq_endmessage(&buf);
+
+	/* Return to caller's context, and flush row's temporary memory */
+	MemoryContextSwitchTo(oldcontext);
+	MemoryContextReset(myState->tmpcontext);
 }
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 607a80d7d4..522b671993 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -3,7 +3,7 @@
  * reloptions.c
  *	  Core support for relation options (pg_class.reloptions)
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -17,12 +17,14 @@
 
 #include "access/gist_private.h"
 #include "access/hash.h"
+#include "access/htup_details.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/spgist.h"
 #include "catalog/pg_type.h"
 #include "commands/defrem.h"
 #include "commands/tablespace.h"
+#include "commands/view.h"
 #include "nodes/makefuncs.h"
 #include "utils/array.h"
 #include "utils/attoptcache.h"
@@ -61,6 +63,14 @@ static relopt_bool boolRelOpts[] =
 	},
 	{
 		{
+			"user_catalog_table",
+			"Declare a table as an additional catalog table, e.g. for the purpose of logical replication",
+			RELOPT_KIND_HEAP
+		},
+		false
+	},
+	{
+		{
 			"fastupdate",
 			"Enables \"fast update\" feature for this GIN index",
 			RELOPT_KIND_GIN
@@ -163,6 +173,14 @@ static relopt_int intRelOpts[] =
 	},
 	{
 		{
+			"autovacuum_multixact_freeze_min_age",
+			"Minimum multixact age at which VACUUM should freeze a row multixact's, for autovacuum",
+			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+		},
+		-1, 0, 1000000000
+	},
+	{
+		{
 			"autovacuum_freeze_max_age",
 			"Age at which to autovacuum a table to prevent transaction ID wraparound",
 			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
@@ -171,11 +189,27 @@ static relopt_int intRelOpts[] =
 	},
 	{
 		{
+			"autovacuum_multixact_freeze_max_age",
+			"Multixact age at which to autovacuum a table to prevent multixact wraparound",
+			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+		},
+		-1, 100000000, 2000000000
+	},
+	{
+		{
 			"autovacuum_freeze_table_age",
-			"Age at which VACUUM should perform a full table sweep to replace old Xid values with FrozenXID",
+			"Age at which VACUUM should perform a full table sweep to freeze row versions",
+			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+		}, -1, 0, 2000000000
+	},
+	{
+		{
+			"autovacuum_multixact_freeze_table_age",
+			"Age of multixact at which VACUUM should perform a full table sweep to freeze row versions",
 			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
 		}, -1, 0, 2000000000
 	},
+
 	/* list terminator */
 	{{NULL}}
 };
@@ -247,6 +281,17 @@ static relopt_string stringRelOpts[] =
 		gistValidateBufferingOption,
 		"auto"
 	},
+	{
+		{
+			"check_option",
+			"View has WITH CHECK OPTION defined (local or cascaded).",
+			RELOPT_KIND_VIEW
+		},
+		0,
+		true,
+		validateWithCheckOption,
+		NULL
+	},
 	/* list terminator */
 	{{NULL}}
 };
@@ -495,7 +540,7 @@ add_real_reloption(bits32 kinds, char *name, char *desc, double default_val,
  *		Add a new string reloption
  *
  * "validator" is an optional function pointer that can be used to test the
- * validity of the values.	It must elog(ERROR) when the argument string is
+ * validity of the values.  It must elog(ERROR) when the argument string is
  * not acceptable for the variable.  Note that the default value must pass
  * the validation.
  */
@@ -790,7 +835,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions)
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
 		case RELKIND_VIEW:
-		case RELKIND_UNCATALOGED:
+		case RELKIND_MATVIEW:
 			options = heap_reloptions(classForm->relkind, datum, false);
 			break;
 		case RELKIND_INDEX:
@@ -823,7 +868,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions)
  * is returned.
  *
  * Note: values of type int, bool and real are allocated as part of the
- * returned array.	Values of type string are allocated separately and must
+ * returned array.  Values of type string are allocated separately and must
  * be freed by the caller.
  */
 relopt_value *
@@ -1145,12 +1190,22 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_max_age)},
 		{"autovacuum_freeze_table_age", RELOPT_TYPE_INT,
 		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_table_age)},
+		{"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_min_age)},
+		{"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_max_age)},
+		{"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_table_age)},
 		{"autovacuum_vacuum_scale_factor", RELOPT_TYPE_REAL,
 		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, vacuum_scale_factor)},
 		{"autovacuum_analyze_scale_factor", RELOPT_TYPE_REAL,
 		offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, analyze_scale_factor)},
 		{"security_barrier", RELOPT_TYPE_BOOL,
 		offsetof(StdRdOptions, security_barrier)},
+		{"check_option", RELOPT_TYPE_STRING,
+		offsetof(StdRdOptions, check_option_offset)},
+		{"user_catalog_table", RELOPT_TYPE_BOOL,
+		offsetof(StdRdOptions, user_catalog_table)}
 	};
 
 	options = parseRelOptions(reloptions, validate, kind, &numoptions);
@@ -1191,6 +1246,7 @@ heap_reloptions(char relkind, Datum reloptions, bool validate)
 			}
 			return (bytea *) rdopts;
 		case RELKIND_RELATION:
+		case RELKIND_MATVIEW:
 			return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP);
 		case RELKIND_VIEW:
 			return default_reloptions(reloptions, validate, RELOPT_KIND_VIEW);
diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c
index 37d77ed7d5..85c31c49c1 100644
--- a/src/backend/access/common/scankey.c
+++ b/src/backend/access/common/scankey.c
@@ -3,7 +3,7 @@
  * scankey.c
  *	  scan key support code
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c
index f5a43d47b8..2e48b32ba3 100644
--- a/src/backend/access/common/tupconvert.c
+++ b/src/backend/access/common/tupconvert.c
@@ -5,11 +5,11 @@
  *
  * These functions provide conversion between rowtypes that are logically
  * equivalent but might have columns in a different order or different sets
- * of dropped columns.	There is some overlap of functionality with the
+ * of dropped columns.  There is some overlap of functionality with the
  * executor's "junkfilter" routines, but these functions work on bare
  * HeapTuples rather than TupleTableSlots.
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -20,6 +20,7 @@
  */
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "access/tupconvert.h"
 #include "utils/builtins.h"
 
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index daac277f03..f3b36893f7 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -3,7 +3,7 @@
  * tupdesc.c
  *	  POSTGRES tuple descriptor support code
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -19,12 +19,13 @@
 
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "catalog/pg_type.h"
 #include "miscadmin.h"
 #include "parser/parse_type.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
-#include "utils/resowner.h"
+#include "utils/resowner_private.h"
 #include "utils/syscache.h"
 
 
@@ -203,6 +204,7 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc)
 				if (constr->check[i].ccbin)
 					cpy->check[i].ccbin = pstrdup(constr->check[i].ccbin);
 				cpy->check[i].ccvalid = constr->check[i].ccvalid;
+				cpy->check[i].ccnoinherit = constr->check[i].ccnoinherit;
 			}
 		}
 
@@ -216,6 +218,47 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc)
 }
 
 /*
+ * TupleDescCopyEntry
+ *		This function copies a single attribute structure from one tuple
+ *		descriptor to another.
+ *
+ * !!! Constraints and defaults are not copied !!!
+ */
+void
+TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno,
+				   TupleDesc src, AttrNumber srcAttno)
+{
+	/*
+	 * sanity checks
+	 */
+	AssertArg(PointerIsValid(src));
+	AssertArg(PointerIsValid(dst));
+	AssertArg(srcAttno >= 1);
+	AssertArg(srcAttno <= src->natts);
+	AssertArg(dstAttno >= 1);
+	AssertArg(dstAttno <= dst->natts);
+
+	memcpy(dst->attrs[dstAttno - 1], src->attrs[srcAttno - 1],
+		   ATTRIBUTE_FIXED_PART_SIZE);
+
+	/*
+	 * Aside from updating the attno, we'd better reset attcacheoff.
+	 *
+	 * XXX Actually, to be entirely safe we'd need to reset the attcacheoff of
+	 * all following columns in dst as well.  Current usage scenarios don't
+	 * require that though, because all following columns will get initialized
+	 * by other uses of this function or TupleDescInitEntry.  So we cheat a
+	 * bit to avoid a useless O(N^2) penalty.
+	 */
+	dst->attrs[dstAttno - 1]->attnum = dstAttno;
+	dst->attrs[dstAttno - 1]->attcacheoff = -1;
+
+	/* since we're not copying constraints or defaults, clear these */
+	dst->attrs[dstAttno - 1]->attnotnull = false;
+	dst->attrs[dstAttno - 1]->atthasdef = false;
+}
+
+/*
  * Free a TupleDesc including all substructure
  */
 void
@@ -416,7 +459,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
 			for (j = 0; j < n; check2++, j++)
 			{
 				if (strcmp(check1->ccname, check2->ccname) == 0 &&
-					strcmp(check1->ccbin, check2->ccbin) == 0)
+					strcmp(check1->ccbin, check2->ccbin) == 0 &&
+					check1->ccvalid == check2->ccvalid &&
+					check1->ccnoinherit == check2->ccnoinherit)
 					break;
 			}
 			if (j >= n)
@@ -536,7 +581,7 @@ TupleDescInitEntryCollation(TupleDesc desc,
  * Given a relation schema (list of ColumnDef nodes), build a TupleDesc.
  *
  * Note: the default assumption is no OIDs; caller may modify the returned
- * TupleDesc if it wants OIDs.	Also, tdtypeid will need to be filled in
+ * TupleDesc if it wants OIDs.  Also, tdtypeid will need to be filled in
  * later on.
  */
 TupleDesc
@@ -579,8 +624,7 @@ BuildDescForRelation(List *schema)
 
 		aclresult = pg_type_aclcheck(atttypid, GetUserId(), ACL_USAGE);
 		if (aclresult != ACLCHECK_OK)
-			aclcheck_error(aclresult, ACL_KIND_TYPE,
-						   format_type_be(atttypid));
+			aclcheck_error_type(aclresult, atttypid);
 
 		attcollation = GetColumnDefCollation(NULL, entry, atttypid);
 		attdim = list_length(entry->typeName->arrayBounds);
diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile
index 889dde6a27..db4f496a4d 100644
--- a/src/backend/access/gin/Makefile
+++ b/src/backend/access/gin/Makefile
@@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global
 
 OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \
 	ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \
-	ginbulk.o ginfast.o
+	ginbulk.o ginfast.o ginpostinglist.o ginlogic.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index 67159d8529..fade0cbb61 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -104,7 +104,7 @@ a few thousand entries can be much faster than retail insertion.  (The win
 comes mainly from not having to do multiple searches/insertions when the
 same key appears in multiple new heap tuples.)
 
-Key entries are nominally of the same IndexEntry format as used in other
+Key entries are nominally of the same IndexTuple format as used in other
 index types, but since a leaf key entry typically refers to multiple heap
 tuples, there are significant differences.  (See GinFormTuple, which works
 by building a "normal" index tuple and then modifying it.)  The points to
@@ -135,15 +135,15 @@ same category of null entry are merged into one index entry just as happens
 with ordinary key entries.
 
 * In a key entry at the btree leaf level, at the next SHORTALIGN boundary,
-there is an array of zero or more ItemPointers, which store the heap tuple
-TIDs for which the indexable items contain this key.  This is called the
-"posting list".  The TIDs in a posting list must appear in sorted order.
-If the list would be too big for the index tuple to fit on an index page,
-the ItemPointers are pushed out to a separate posting page or pages, and
-none appear in the key entry itself.  The separate pages are called a
-"posting tree"; they are organized as a btree of ItemPointer values.
-Note that in either case, the ItemPointers associated with a key can
-easily be read out in sorted order; this is relied on by the scan
+there is a list of item pointers, in compressed format (see Posting List
+Compression section), pointing to the heap tuples for which the indexable
+items contain this key. This is called the "posting list".
+
+If the list would be too big for the index tuple to fit on an index page, the
+ItemPointers are pushed out to a separate posting page or pages, and none
+appear in the key entry itself.  The separate pages are called a "posting
+tree" (see below); Note that in either case, the ItemPointers associated with
+a key can easily be read out in sorted order; this is relied on by the scan
 algorithms.
 
 * The index tuple header fields of a leaf key entry are abused as follows:
@@ -163,6 +163,11 @@ algorithms.
 
 * The posting list can be accessed with GinGetPosting(itup)
 
+* If GinITupIsCompressed(itup), the posting list is stored in compressed
+  format. Otherwise it is just an array of ItemPointers. New tuples are always
+  stored in compressed format, uncompressed items can be present if the
+  database was migrated from 9.3 or earlier version.
+
 2) Posting tree case:
 
 * ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number
@@ -176,7 +181,7 @@ algorithms.
   GinSetPostingTree macro.
 
 * If IndexTupleHasNulls(itup) is true, the null category byte can be
-  accessed/set with GinGetNullCategory(itup) / GinSetNullCategory(itup,c)
+  accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c)
 
 * The posting list is not present and must not be accessed.
 
@@ -210,6 +215,145 @@ fit on one pending-list page must have those pages to itself, even if this
 results in wasting much of the space on the preceding page and the last
 page for the tuple.)
 
+Posting tree
+------------
+
+If a posting list is too large to store in-line in a key entry, a posting tree
+is created. A posting tree is a B-tree structure, where the ItemPointer is
+used as the key.
+
+Internal posting tree pages use the standard PageHeader and the same "opaque"
+struct as other GIN page, but do not contain regular index tuples. Instead,
+the contents of the page is an array of PostingItem structs. Each PostingItem
+consists of the block number of the child page, and the right bound of that
+child page, as an ItemPointer. The right bound of the page is stored right
+after the page header, before the PostingItem array.
+
+Posting tree leaf pages also use the standard PageHeader and opaque struct,
+and the right bound of the page is stored right after the page header, but
+the page content comprises of a number of compressed posting lists. The
+compressed posting lists are stored one after each other, between page header
+and pd_lower. The space between pd_lower and pd_upper is unused, which allows
+full-page images of posting tree leaf pages to skip the unused space in middle
+(buffer_std = true in XLogRecData).
+
+The item pointers are stored in a number of independent compressed posting
+lists (also called segments), instead of one big one, to make random access
+to a given item pointer faster: to find an item in a compressed list, you
+have to read the list from the beginning, but when the items are split into
+multiple lists, you can first skip over to the list containing the item you're
+looking for, and read only that segment. Also, an update only needs to
+re-encode the affected segment.
+
+Posting List Compression
+------------------------
+
+To fit as many item pointers on a page as possible, posting tree leaf pages
+and posting lists stored inline in entry tree leaf tuples use a lightweight
+form of compression. We take advantage of the fact that the item pointers
+are stored in sorted order. Instead of storing the block and offset number of
+each item pointer separately, we store the difference from the previous item.
+That in itself doesn't do much, but it allows us to use so-called varbyte
+encoding to compress them.
+
+Varbyte encoding is a method to encode integers, allowing smaller numbers to
+take less space at the cost of larger numbers. Each integer is represented by
+variable number of bytes. High bit of each byte in varbyte encoding determines
+whether the next byte is still part of this number. Therefore, to read a single
+varbyte encoded number, you have to read bytes until you find a byte with the
+high bit not set.
+
+When encoding, the block and offset number forming the item pointer are
+combined into a single integer. The offset number is stored in the 11 low
+bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number
+is stored in the higher bits. That requires 43 bits in total, which
+conveniently fits in at most 6 bytes.
+
+A compressed posting list is passed around and stored on disk in a
+PackedPostingList struct. The first item in the list is stored uncompressed
+as a regular ItemPointerData, followed by the length of the list in bytes,
+followed by the packed items.
+
+Concurrency
+-----------
+
+The entry tree and each posting tree is a B-tree, with right-links connecting
+sibling pages at the same level. This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning a GIN trees backwards, so we don't need left-links.
+
+To avoid deadlocks, B-tree pages must always be locked in the same order:
+left to right, and bottom to top. When searching, the tree is traversed from
+top to bottom, so the lock on the parent page must be released before
+descending to the next level. Concurrent page splits move the keyspace to
+right, so after following a downlink, the page actually containing the key
+we're looking for might be somewhere to the right of the page we landed on.
+In that case, we follow the right-links until we find the page we're looking
+for.
+
+To delete a page, the page's left sibling, the target page, and its parent,
+are locked in that order, and the page is marked as deleted. However, a
+concurrent search might already have read a pointer to the page, and might be
+just about to follow it. A page can be reached via the right-link of its left
+sibling, or via its downlink in the parent.
+
+To prevent a backend from reaching a deleted page via a right-link, when
+following a right-link the lock on the previous page is not released until
+the lock on next page has been acquired.
+
+The downlink is more tricky. A search descending the tree must release the
+lock on the parent page before locking the child, or it could deadlock with
+a concurrent split of the child page; a page split locks the parent, while
+already holding a lock on the child page. However, posting trees are only
+fully searched from left to right, starting from the leftmost leaf. (The
+tree-structure is only needed by insertions, to quickly find the correct
+insert location). So as long as we don't delete the leftmost page on each
+level, a search can never follow a downlink to page that's about to be
+deleted.
+
+The previous paragraph's reasoning only applies to searches, and only to
+posting trees. To protect from inserters following a downlink to a deleted
+page, vacuum simply locks out all concurrent insertions to the posting tree,
+by holding a super-exclusive lock on the posting tree root. Inserters hold a
+pin on the root page, but searches do not, so while new searches cannot begin
+while root page is locked, any already-in-progress scans can continue
+concurrently with vacuum. In the entry tree, we never delete pages.
+
+(This is quite different from the mechanism the btree indexam uses to make
+page-deletions safe; it stamps the deleted pages with an XID and keeps the
+deleted pages around with the right-link intact until all concurrent scans
+have finished.)
+
+Compatibility
+-------------
+
+Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in
+uncompressed format because of pg_upgrade from 9.3 or earlier versions.
+For compatibility, old uncompressed format is also supported. Following
+rules are used to handle it:
+
+* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list.
+This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid).
+Use GinItupIsCompressed(itup) to check the flag.
+
+* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag.
+  Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to
+  check and set this flag.
+
+* All scan operations check format of posting list add use corresponding code
+to read its content.
+
+* When updating an index tuple containing an uncompressed posting list, it
+will be replaced with new index tuple containing a compressed list.
+
+* When updating an uncompressed posting tree leaf page, it's compressed.
+
+* If vacuum finds some dead TIDs in uncompressed posting lists, they are
+converted into compressed posting lists. This assumes that the compressed
+posting list fits in the space occupied by the uncompressed list. IOW, we
+assume that the compressed version of the page, with the dead items removed,
+takes less space than the old uncompressed version.
+
 Limitations
 -----------
 
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
index f1f934331c..66cea28113 100644
--- a/src/backend/access/gin/ginarrayproc.c
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -4,7 +4,7 @@
  *	  support functions for GIN's indexing of any array
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -197,7 +197,7 @@ ginarrayconsistent(PG_FUNCTION_ARGS)
 
 			/*
 			 * Must have all elements in check[] true; no discrimination
-			 * against nulls here.	This is because array_contain_compare and
+			 * against nulls here.  This is because array_contain_compare and
 			 * array_eq handle nulls differently ...
 			 */
 			res = true;
@@ -218,3 +218,88 @@ ginarrayconsistent(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(res);
 }
+
+/*
+ * triconsistent support function
+ */
+Datum
+ginarraytriconsistent(PG_FUNCTION_ARGS)
+{
+	GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
+	StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+	/* ArrayType  *query = PG_GETARG_ARRAYTYPE_P(2); */
+	int32		nkeys = PG_GETARG_INT32(3);
+
+	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+	/* Datum	   *queryKeys = (Datum *) PG_GETARG_POINTER(5); */
+	bool	   *nullFlags = (bool *) PG_GETARG_POINTER(6);
+	GinTernaryValue res;
+	int32		i;
+
+	switch (strategy)
+	{
+		case GinOverlapStrategy:
+			/* must have a match for at least one non-null element */
+			res = GIN_FALSE;
+			for (i = 0; i < nkeys; i++)
+			{
+				if (!nullFlags[i])
+				{
+					if (check[i] == GIN_TRUE)
+					{
+						res = GIN_TRUE;
+						break;
+					}
+					else if (check[i] == GIN_MAYBE && res == GIN_FALSE)
+					{
+						res = GIN_MAYBE;
+					}
+				}
+			}
+			break;
+		case GinContainsStrategy:
+			/* must have all elements in check[] true, and no nulls */
+			res = GIN_TRUE;
+			for (i = 0; i < nkeys; i++)
+			{
+				if (check[i] == GIN_FALSE || nullFlags[i])
+				{
+					res = GIN_FALSE;
+					break;
+				}
+				if (check[i] == GIN_MAYBE)
+				{
+					res = GIN_MAYBE;
+				}
+			}
+			break;
+		case GinContainedStrategy:
+			/* can't do anything else useful here */
+			res = GIN_MAYBE;
+			break;
+		case GinEqualStrategy:
+
+			/*
+			 * Must have all elements in check[] true; no discrimination
+			 * against nulls here.  This is because array_contain_compare and
+			 * array_eq handle nulls differently ...
+			 */
+			res = GIN_MAYBE;
+			for (i = 0; i < nkeys; i++)
+			{
+				if (check[i] == GIN_FALSE)
+				{
+					res = GIN_FALSE;
+					break;
+				}
+			}
+			break;
+		default:
+			elog(ERROR, "ginarrayconsistent: unknown strategy number: %d",
+				 strategy);
+			res = false;
+	}
+
+	PG_RETURN_GIN_TERNARY_VALUE(res);
+}
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index b160551b5e..27f88e0eb2 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -4,7 +4,7 @@
  *	  page utilities routines for the postgres inverted index access method.
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -18,8 +18,15 @@
 #include "miscadmin.h"
 #include "utils/rel.h"
 
+static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
+static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+			   void *insertdata, BlockNumber updateblkno,
+			   Buffer childbuf, GinStatsData *buildStats);
+static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
+			   bool freestack, GinStatsData *buildStats);
+
 /*
- * Locks buffer by needed method for search.
+ * Lock buffer by needed method for search.
  */
 static int
 ginTraverseLock(Buffer buffer, bool searchMode)
@@ -52,58 +59,51 @@ ginTraverseLock(Buffer buffer, bool searchMode)
 	return access;
 }
 
-GinBtreeStack *
-ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno)
-{
-	GinBtreeStack *stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
-
-	stack->blkno = blkno;
-	stack->buffer = ReadBuffer(btree->index, stack->blkno);
-	stack->parent = NULL;
-	stack->predictNumber = 1;
-
-	ginTraverseLock(stack->buffer, btree->searchMode);
-
-	return stack;
-}
-
 /*
- * Locates leaf page contained tuple
+ * Descend the tree to the leaf page that contains or would contain the key
+ * we're searching for. The key should already be filled in 'btree', in
+ * tree-type specific manner. If btree->fullScan is true, descends to the
+ * leftmost leaf page.
+ *
+ * If 'searchmode' is false, on return stack->buffer is exclusively locked,
+ * and the stack represents the full path to the root. Otherwise stack->buffer
+ * is share-locked, and stack->parent is NULL.
  */
 GinBtreeStack *
-ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
+ginFindLeafPage(GinBtree btree, bool searchMode)
 {
-	bool		isfirst = TRUE;
-	BlockNumber rootBlkno;
+	GinBtreeStack *stack;
 
-	if (!stack)
-		stack = ginPrepareFindLeafPage(btree, GIN_ROOT_BLKNO);
-	rootBlkno = stack->blkno;
+	stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
+	stack->blkno = btree->rootBlkno;
+	stack->buffer = ReadBuffer(btree->index, btree->rootBlkno);
+	stack->parent = NULL;
+	stack->predictNumber = 1;
 
 	for (;;)
 	{
 		Page		page;
 		BlockNumber child;
-		int			access = GIN_SHARE;
+		int			access;
 
 		stack->off = InvalidOffsetNumber;
 
 		page = BufferGetPage(stack->buffer);
 
-		if (isfirst)
-		{
-			if (GinPageIsLeaf(page) && !btree->searchMode)
-				access = GIN_EXCLUSIVE;
-			isfirst = FALSE;
-		}
-		else
-			access = ginTraverseLock(stack->buffer, btree->searchMode);
+		access = ginTraverseLock(stack->buffer, searchMode);
+
+		/*
+		 * If we're going to modify the tree, finish any incomplete splits we
+		 * encounter on the way.
+		 */
+		if (!searchMode && GinPageIsIncompleteSplit(page))
+			ginFinishSplit(btree, stack, false, NULL);
 
 		/*
 		 * ok, page is correctly locked, we should check to move right ..,
 		 * root never has a right link, so small optimization
 		 */
-		while (btree->fullScan == FALSE && stack->blkno != rootBlkno &&
+		while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno &&
 			   btree->isMoveRight(btree, page))
 		{
 			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
@@ -112,11 +112,12 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
 				/* rightmost page */
 				break;
 
+			stack->buffer = ginStepRight(stack->buffer, btree->index, access);
 			stack->blkno = rightlink;
-			LockBuffer(stack->buffer, GIN_UNLOCK);
-			stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
-			LockBuffer(stack->buffer, access);
 			page = BufferGetPage(stack->buffer);
+
+			if (!searchMode && GinPageIsIncompleteSplit(page))
+				ginFinishSplit(btree, stack, false, NULL);
 		}
 
 		if (GinPageIsLeaf(page))	/* we found, return locked page */
@@ -129,7 +130,7 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
 		Assert(child != InvalidBlockNumber);
 		Assert(stack->blkno != child);
 
-		if (btree->searchMode)
+		if (searchMode)
 		{
 			/* in search mode we may forget path to leaf */
 			stack->blkno = child;
@@ -146,9 +147,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
 			stack->predictNumber = 1;
 		}
 	}
+}
 
-	/* keep compiler happy */
-	return NULL;
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+	Buffer		nextbuffer;
+	Page		page = BufferGetPage(buffer);
+	bool		isLeaf = GinPageIsLeaf(page);
+	bool		isData = GinPageIsData(page);
+	BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+	nextbuffer = ReadBuffer(index, blkno);
+	LockBuffer(nextbuffer, lockmode);
+	UnlockReleaseBuffer(buffer);
+
+	/* Sanity check that the page we stepped to is of similar kind. */
+	page = BufferGetPage(nextbuffer);
+	if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+		elog(ERROR, "right sibling of GIN page is of different type");
+
+	/*
+	 * Given the proper lock sequence above, we should never land on a deleted
+	 * page.
+	 */
+	if (GinPageIsDeleted(page))
+		elog(ERROR, "right sibling of GIN page was deleted");
+
+	return nextbuffer;
 }
 
 void
@@ -167,172 +200,293 @@ freeGinBtreeStack(GinBtreeStack *stack)
 }
 
 /*
- * Try to find parent for current stack position, returns correct
- * parent and child's offset in  stack->parent.
- * Function should never release root page to prevent conflicts
- * with vacuum process
+ * Try to find parent for current stack position. Returns correct parent and
+ * child's offset in stack->parent. The root page is never released, to
+ * to prevent conflict with vacuum process.
  */
-void
-ginFindParents(GinBtree btree, GinBtreeStack *stack,
-			   BlockNumber rootBlkno)
+static void
+ginFindParents(GinBtree btree, GinBtreeStack *stack)
 {
-
 	Page		page;
 	Buffer		buffer;
 	BlockNumber blkno,
 				leftmostBlkno;
 	OffsetNumber offset;
-	GinBtreeStack *root = stack->parent;
+	GinBtreeStack *root;
 	GinBtreeStack *ptr;
 
-	if (!root)
+	/*
+	 * Unwind the stack all the way up to the root, leaving only the root
+	 * item.
+	 *
+	 * Be careful not to release the pin on the root page! The pin on root
+	 * page is required to lock out concurrent vacuums on the tree.
+	 */
+	root = stack->parent;
+	while (root->parent)
 	{
-		/* XLog mode... */
-		root = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
-		root->blkno = rootBlkno;
-		root->buffer = ReadBuffer(btree->index, rootBlkno);
-		LockBuffer(root->buffer, GIN_EXCLUSIVE);
-		root->parent = NULL;
+		ReleaseBuffer(root->buffer);
+		root = root->parent;
 	}
-	else
-	{
-		/*
-		 * find root, we should not release root page until update is
-		 * finished!!
-		 */
-		while (root->parent)
-		{
-			ReleaseBuffer(root->buffer);
-			root = root->parent;
-		}
 
-		Assert(root->blkno == rootBlkno);
-		Assert(BufferGetBlockNumber(root->buffer) == rootBlkno);
-		LockBuffer(root->buffer, GIN_EXCLUSIVE);
-	}
+	Assert(root->blkno == btree->rootBlkno);
+	Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno);
 	root->off = InvalidOffsetNumber;
 
-	page = BufferGetPage(root->buffer);
-	Assert(!GinPageIsLeaf(page));
-
-	/* check trivial case */
-	if ((root->off = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) != InvalidOffsetNumber)
-	{
-		stack->parent = root;
-		return;
-	}
+	blkno = root->blkno;
+	buffer = root->buffer;
+	offset = InvalidOffsetNumber;
 
-	leftmostBlkno = blkno = btree->getLeftMostPage(btree, page);
-	LockBuffer(root->buffer, GIN_UNLOCK);
-	Assert(blkno != InvalidBlockNumber);
+	ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
 
 	for (;;)
 	{
-		buffer = ReadBuffer(btree->index, blkno);
 		LockBuffer(buffer, GIN_EXCLUSIVE);
 		page = BufferGetPage(buffer);
 		if (GinPageIsLeaf(page))
 			elog(ERROR, "Lost path");
 
-		leftmostBlkno = btree->getLeftMostPage(btree, page);
+		if (GinPageIsIncompleteSplit(page))
+		{
+			Assert(blkno != btree->rootBlkno);
+			ptr->blkno = blkno;
+			ptr->buffer = buffer;
+
+			/*
+			 * parent may be wrong, but if so, the ginFinishSplit call will
+			 * recurse to call ginFindParents again to fix it.
+			 */
+			ptr->parent = root;
+			ptr->off = InvalidOffsetNumber;
+
+			ginFinishSplit(btree, ptr, false, NULL);
+		}
+
+		leftmostBlkno = btree->getLeftMostChild(btree, page);
 
 		while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
 		{
 			blkno = GinPageGetOpaque(page)->rightlink;
-			LockBuffer(buffer, GIN_UNLOCK);
-			ReleaseBuffer(buffer);
 			if (blkno == InvalidBlockNumber)
+			{
+				UnlockReleaseBuffer(buffer);
 				break;
-			buffer = ReadBuffer(btree->index, blkno);
-			LockBuffer(buffer, GIN_EXCLUSIVE);
+			}
+			buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
 			page = BufferGetPage(buffer);
+
+			/* finish any incomplete splits, as above */
+			if (GinPageIsIncompleteSplit(page))
+			{
+				Assert(blkno != btree->rootBlkno);
+				ptr->blkno = blkno;
+				ptr->buffer = buffer;
+				ptr->parent = root;
+				ptr->off = InvalidOffsetNumber;
+
+				ginFinishSplit(btree, ptr, false, NULL);
+			}
 		}
 
 		if (blkno != InvalidBlockNumber)
 		{
-			ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
 			ptr->blkno = blkno;
 			ptr->buffer = buffer;
-			ptr->parent = root; /* it's may be wrong, but in next call we will
+			ptr->parent = root; /* it may be wrong, but in next call we will
 								 * correct */
 			ptr->off = offset;
 			stack->parent = ptr;
 			return;
 		}
 
+		/* Descend down to next level */
 		blkno = leftmostBlkno;
+		buffer = ReadBuffer(btree->index, blkno);
 	}
 }
 
 /*
- * Insert value (stored in GinBtree) to tree described by stack
+ * Insert a new item to a page.
  *
- * During an index build, buildStats is non-null and the counters
- * it contains should be incremented as needed.
+ * Returns true if the insertion was finished. On false, the page was split and
+ * the parent needs to be updated. (a root split returns true as it doesn't
+ * need any further action by the caller to complete)
  *
- * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
+ * When inserting a downlink to a internal page, 'childbuf' contains the
+ * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
+ * atomically with the insert. Also, the existing item at the given location
+ * is updated to point to 'updateblkno'.
+ *
+ * stack->buffer is locked on entry, and is kept locked.
  */
-void
-ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
+static bool
+ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+			   void *insertdata, BlockNumber updateblkno,
+			   Buffer childbuf, GinStatsData *buildStats)
 {
-	GinBtreeStack *parent = stack;
-	BlockNumber rootBlkno = InvalidBuffer;
-	Page		page,
-				rpage,
-				lpage;
-
-	/* remember root BlockNumber */
-	while (parent)
+	Page		page = BufferGetPage(stack->buffer);
+	XLogRecData *payloadrdata;
+	GinPlaceToPageRC rc;
+	uint16		xlflags = 0;
+	Page		childpage = NULL;
+	Page		newlpage = NULL,
+				newrpage = NULL;
+
+	if (GinPageIsData(page))
+		xlflags |= GIN_INSERT_ISDATA;
+	if (GinPageIsLeaf(page))
+	{
+		xlflags |= GIN_INSERT_ISLEAF;
+		Assert(!BufferIsValid(childbuf));
+		Assert(updateblkno == InvalidBlockNumber);
+	}
+	else
 	{
-		rootBlkno = parent->blkno;
-		parent = parent->parent;
+		Assert(BufferIsValid(childbuf));
+		Assert(updateblkno != InvalidBlockNumber);
+		childpage = BufferGetPage(childbuf);
 	}
 
-	while (stack)
+	/*
+	 * Try to put the incoming tuple on the page. placeToPage will decide if
+	 * the page needs to be split.
+	 */
+	rc = btree->placeToPage(btree, stack->buffer, stack,
+							insertdata, updateblkno,
+							&payloadrdata, &newlpage, &newrpage);
+	if (rc == UNMODIFIED)
+		return true;
+	else if (rc == INSERTED)
 	{
-		XLogRecData *rdata;
-		BlockNumber savedRightLink;
+		/* placeToPage did START_CRIT_SECTION() */
+		MarkBufferDirty(stack->buffer);
 
-		page = BufferGetPage(stack->buffer);
-		savedRightLink = GinPageGetOpaque(page)->rightlink;
+		/* An insert to an internal page finishes the split of the child. */
+		if (childbuf != InvalidBuffer)
+		{
+			GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+			MarkBufferDirty(childbuf);
+		}
 
-		if (btree->isEnoughSpace(btree, stack->buffer, stack->off))
+		if (RelationNeedsWAL(btree->index))
 		{
-			START_CRIT_SECTION();
-			btree->placeToPage(btree, stack->buffer, stack->off, &rdata);
+			XLogRecPtr	recptr;
+			XLogRecData rdata[3];
+			ginxlogInsert xlrec;
+			BlockIdData childblknos[2];
 
-			MarkBufferDirty(stack->buffer);
+			xlrec.node = btree->index->rd_node;
+			xlrec.blkno = BufferGetBlockNumber(stack->buffer);
+			xlrec.flags = xlflags;
 
-			if (RelationNeedsWAL(btree->index))
+			rdata[0].buffer = InvalidBuffer;
+			rdata[0].data = (char *) &xlrec;
+			rdata[0].len = sizeof(ginxlogInsert);
+
+			/*
+			 * Log information about child if this was an insertion of a
+			 * downlink.
+			 */
+			if (childbuf != InvalidBuffer)
 			{
-				XLogRecPtr	recptr;
+				rdata[0].next = &rdata[1];
 
-				recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
-				PageSetLSN(page, recptr);
-				PageSetTLI(page, ThisTimeLineID);
+				BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf));
+				BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink);
+
+				rdata[1].buffer = InvalidBuffer;
+				rdata[1].data = (char *) childblknos;
+				rdata[1].len = sizeof(BlockIdData) * 2;
+				rdata[1].next = &rdata[2];
+
+				rdata[2].buffer = childbuf;
+				rdata[2].buffer_std = false;
+				rdata[2].data = NULL;
+				rdata[2].len = 0;
+				rdata[2].next = payloadrdata;
 			}
+			else
+				rdata[0].next = payloadrdata;
 
-			LockBuffer(stack->buffer, GIN_UNLOCK);
-			END_CRIT_SECTION();
+			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
+			PageSetLSN(page, recptr);
+			if (childbuf != InvalidBuffer)
+				PageSetLSN(childpage, recptr);
+		}
 
-			freeGinBtreeStack(stack);
+		END_CRIT_SECTION();
 
-			return;
+		return true;
+	}
+	else if (rc == SPLIT)
+	{
+		/* Didn't fit, have to split */
+		Buffer		rbuffer;
+		BlockNumber savedRightLink;
+		XLogRecData rdata[2];
+		ginxlogSplit data;
+		Buffer		lbuffer = InvalidBuffer;
+		Page		newrootpg = NULL;
+
+		rbuffer = GinNewBuffer(btree->index);
+
+		/* During index build, count the new page */
+		if (buildStats)
+		{
+			if (btree->isData)
+				buildStats->nDataPages++;
+			else
+				buildStats->nEntryPages++;
+		}
+
+		savedRightLink = GinPageGetOpaque(page)->rightlink;
+
+		/*
+		 * newlpage and newrpage are pointers to memory pages, not associated
+		 * with buffers. stack->buffer is not touched yet.
+		 */
+
+		data.node = btree->index->rd_node;
+		data.rblkno = BufferGetBlockNumber(rbuffer);
+		data.flags = xlflags;
+		if (childbuf != InvalidBuffer)
+		{
+			Page		childpage = BufferGetPage(childbuf);
+
+			GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+
+			data.leftChildBlkno = BufferGetBlockNumber(childbuf);
+			data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
 		}
 		else
+			data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &data;
+		rdata[0].len = sizeof(ginxlogSplit);
+
+		if (childbuf != InvalidBuffer)
 		{
-			Buffer		rbuffer = GinNewBuffer(btree->index);
-			Page		newlpage;
+			rdata[0].next = &rdata[1];
+
+			rdata[1].buffer = childbuf;
+			rdata[1].buffer_std = false;
+			rdata[1].data = NULL;
+			rdata[1].len = 0;
+			rdata[1].next = payloadrdata;
+		}
+		else
+			rdata[0].next = payloadrdata;
 
+		if (stack->parent == NULL)
+		{
 			/*
-			 * newlpage is a pointer to memory page, it doesn't associate with
-			 * buffer, stack->buffer should be untouched
+			 * split root, so we need to allocate new left page and place
+			 * pointer on root to left and right page
 			 */
-			newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
-
-			((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
+			lbuffer = GinNewBuffer(btree->index);
 
-			/* During index build, count the newly-split page */
+			/* During index build, count the newly-added root page */
 			if (buildStats)
 			{
 				if (btree->isData)
@@ -341,134 +495,259 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
 					buildStats->nEntryPages++;
 			}
 
-			parent = stack->parent;
+			/*
+			 * root never has a right-link, so we borrow the rrlink field to
+			 * store the root block number.
+			 */
+			data.rrlink = BufferGetBlockNumber(stack->buffer);
+			data.lblkno = BufferGetBlockNumber(lbuffer);
+			data.flags |= GIN_SPLIT_ROOT;
 
-			if (parent == NULL)
-			{
-				/*
-				 * split root, so we need to allocate new left page and place
-				 * pointer on root to left and right page
-				 */
-				Buffer		lbuffer = GinNewBuffer(btree->index);
-
-				((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
-				((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
-
-				page = BufferGetPage(stack->buffer);
-				lpage = BufferGetPage(lbuffer);
-				rpage = BufferGetPage(rbuffer);
-
-				GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
-				GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
-				((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer);
-
-				START_CRIT_SECTION();
-
-				GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF);
-				PageRestoreTempPage(newlpage, lpage);
-				btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer);
-
-				MarkBufferDirty(rbuffer);
-				MarkBufferDirty(lbuffer);
-				MarkBufferDirty(stack->buffer);
-
-				if (RelationNeedsWAL(btree->index))
-				{
-					XLogRecPtr	recptr;
-
-					recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
-					PageSetLSN(page, recptr);
-					PageSetTLI(page, ThisTimeLineID);
-					PageSetLSN(lpage, recptr);
-					PageSetTLI(lpage, ThisTimeLineID);
-					PageSetLSN(rpage, recptr);
-					PageSetTLI(rpage, ThisTimeLineID);
-				}
-
-				UnlockReleaseBuffer(rbuffer);
-				UnlockReleaseBuffer(lbuffer);
-				LockBuffer(stack->buffer, GIN_UNLOCK);
-				END_CRIT_SECTION();
-
-				freeGinBtreeStack(stack);
-
-				/* During index build, count the newly-added root page */
-				if (buildStats)
-				{
-					if (btree->isData)
-						buildStats->nDataPages++;
-					else
-						buildStats->nEntryPages++;
-				}
-
-				return;
-			}
-			else
-			{
-				/* split non-root page */
-				((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE;
-				((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink;
-
-				lpage = BufferGetPage(stack->buffer);
-				rpage = BufferGetPage(rbuffer);
-
-				GinPageGetOpaque(rpage)->rightlink = savedRightLink;
-				GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
-
-				START_CRIT_SECTION();
-				PageRestoreTempPage(newlpage, lpage);
-
-				MarkBufferDirty(rbuffer);
-				MarkBufferDirty(stack->buffer);
-
-				if (RelationNeedsWAL(btree->index))
-				{
-					XLogRecPtr	recptr;
-
-					recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
-					PageSetLSN(lpage, recptr);
-					PageSetTLI(lpage, ThisTimeLineID);
-					PageSetLSN(rpage, recptr);
-					PageSetTLI(rpage, ThisTimeLineID);
-				}
-				UnlockReleaseBuffer(rbuffer);
-				END_CRIT_SECTION();
-			}
+			GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
+			GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+
+			/*
+			 * Construct a new root page containing downlinks to the new left
+			 * and right pages. (do this in a temporary copy first rather than
+			 * overwriting the original page directly, so that we can still
+			 * abort gracefully if this fails.)
+			 */
+			newrootpg = PageGetTempPage(newrpage);
+			GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
+
+			btree->fillRoot(btree, newrootpg,
+							BufferGetBlockNumber(lbuffer), newlpage,
+							BufferGetBlockNumber(rbuffer), newrpage);
+		}
+		else
+		{
+			/* split non-root page */
+			data.rrlink = savedRightLink;
+			data.lblkno = BufferGetBlockNumber(stack->buffer);
+
+			GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
+			GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
+			GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+		}
+
+		/*
+		 * Ok, we have the new contents of the left page in a temporary copy
+		 * now (newlpage), and the newly-allocated right block has been filled
+		 * in. The original page is still unchanged.
+		 *
+		 * If this is a root split, we also have a temporary page containing
+		 * the new contents of the root. Copy the new left page to a
+		 * newly-allocated block, and initialize the (original) root page the
+		 * new copy. Otherwise, copy over the temporary copy of the new left
+		 * page over the old left page.
+		 */
+
+		START_CRIT_SECTION();
+
+		MarkBufferDirty(rbuffer);
+		MarkBufferDirty(stack->buffer);
+		if (BufferIsValid(childbuf))
+			MarkBufferDirty(childbuf);
+
+		/*
+		 * Restore the temporary copies over the real buffers. But don't free
+		 * the temporary copies yet, WAL record data points to them.
+		 */
+		if (stack->parent == NULL)
+		{
+			MarkBufferDirty(lbuffer);
+			memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
+			memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
+			memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
 		}
+		else
+		{
+			memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
+			memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
+		}
+
+		/* write WAL record */
+		if (RelationNeedsWAL(btree->index))
+		{
+			XLogRecPtr	recptr;
+
+			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
+			PageSetLSN(BufferGetPage(stack->buffer), recptr);
+			PageSetLSN(BufferGetPage(rbuffer), recptr);
+			if (stack->parent == NULL)
+				PageSetLSN(BufferGetPage(lbuffer), recptr);
+			if (BufferIsValid(childbuf))
+				PageSetLSN(childpage, recptr);
+		}
+		END_CRIT_SECTION();
+
+		/*
+		 * We can release the lock on the right page now, but keep the
+		 * original buffer locked.
+		 */
+		UnlockReleaseBuffer(rbuffer);
+		if (stack->parent == NULL)
+			UnlockReleaseBuffer(lbuffer);
+
+		pfree(newlpage);
+		pfree(newrpage);
+		if (newrootpg)
+			pfree(newrootpg);
+
+		/*
+		 * If we split the root, we're done. Otherwise the split is not
+		 * complete until the downlink for the new page has been inserted to
+		 * the parent.
+		 */
+		if (stack->parent == NULL)
+			return true;
+		else
+			return false;
+	}
+	else
+	{
+		elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
+		return false;			/* keep compiler quiet */
+	}
+}
 
-		btree->isDelete = FALSE;
+/*
+ * Finish a split by inserting the downlink for the new page to parent.
+ *
+ * On entry, stack->buffer is exclusively locked.
+ *
+ * If freestack is true, all the buffers are released and unlocked as we
+ * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept
+ * locked, and stack is unmodified, except for possibly moving right to find
+ * the correct parent of page.
+ */
+static void
+ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
+			   GinStatsData *buildStats)
+{
+	Page		page;
+	bool		done;
+	bool		first = true;
+
+	/*
+	 * freestack == false when we encounter an incompletely split page during
+	 * a scan, while freestack == true is used in the normal scenario that a
+	 * split is finished right after the initial insert.
+	 */
+	if (!freestack)
+		elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
+			 stack->blkno, RelationGetRelationName(btree->index));
+
+	/* this loop crawls up the stack until the insertion is complete */
+	do
+	{
+		GinBtreeStack *parent = stack->parent;
+		void	   *insertdata;
+		BlockNumber updateblkno;
 
 		/* search parent to lock */
 		LockBuffer(parent->buffer, GIN_EXCLUSIVE);
 
+		/*
+		 * If the parent page was incompletely split, finish that split first,
+		 * then continue with the current one.
+		 *
+		 * Note: we have to finish *all* incomplete splits we encounter, even
+		 * if we have to move right. Otherwise we might choose as the target a
+		 * page that has no downlink in the parent, and splitting it further
+		 * would fail.
+		 */
+		if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+			ginFinishSplit(btree, parent, false, buildStats);
+
 		/* move right if it's needed */
 		page = BufferGetPage(parent->buffer);
 		while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber)
 		{
-			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
-
-			LockBuffer(parent->buffer, GIN_UNLOCK);
-
-			if (rightlink == InvalidBlockNumber)
+			if (GinPageRightMost(page))
 			{
 				/*
 				 * rightmost page, but we don't find parent, we should use
 				 * plain search...
 				 */
-				ginFindParents(btree, stack, rootBlkno);
+				LockBuffer(parent->buffer, GIN_UNLOCK);
+				ginFindParents(btree, stack);
 				parent = stack->parent;
-				page = BufferGetPage(parent->buffer);
+				Assert(parent != NULL);
 				break;
 			}
 
-			parent->blkno = rightlink;
-			parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
-			LockBuffer(parent->buffer, GIN_EXCLUSIVE);
+			parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
+			parent->blkno = BufferGetBlockNumber(parent->buffer);
 			page = BufferGetPage(parent->buffer);
+
+			if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+				ginFinishSplit(btree, parent, false, buildStats);
 		}
 
-		UnlockReleaseBuffer(stack->buffer);
-		pfree(stack);
+		/* insert the downlink */
+		insertdata = btree->prepareDownlink(btree, stack->buffer);
+		updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink;
+		done = ginPlaceToPage(btree, parent,
+							  insertdata, updateblkno,
+							  stack->buffer, buildStats);
+		pfree(insertdata);
+
+		/*
+		 * If the caller requested to free the stack, unlock and release the
+		 * child buffer now. Otherwise keep it pinned and locked, but if we
+		 * have to recurse up the tree, we can unlock the upper pages, only
+		 * keeping the page at the bottom of the stack locked.
+		 */
+		if (!first || freestack)
+			LockBuffer(stack->buffer, GIN_UNLOCK);
+		if (freestack)
+		{
+			ReleaseBuffer(stack->buffer);
+			pfree(stack);
+		}
 		stack = parent;
+
+		first = false;
+	} while (!done);
+
+	/* unlock the parent */
+	LockBuffer(stack->buffer, GIN_UNLOCK);
+
+	if (freestack)
+		freeGinBtreeStack(stack);
+}
+
+/*
+ * Insert a value to tree described by stack.
+ *
+ * The value to be inserted is given in 'insertdata'. Its format depends
+ * on whether this is an entry or data tree, ginInsertValue just passes it
+ * through to the tree-specific callback function.
+ *
+ * During an index build, buildStats is non-null and the counters it contains
+ * are incremented as needed.
+ *
+ * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
+ */
+void
+ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,
+			   GinStatsData *buildStats)
+{
+	bool		done;
+
+	/* If the leaf page was incompletely split, finish the split first */
+	if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
+		ginFinishSplit(btree, stack, false, buildStats);
+
+	done = ginPlaceToPage(btree, stack,
+						  insertdata, InvalidBlockNumber,
+						  InvalidBuffer, buildStats);
+	if (done)
+	{
+		LockBuffer(stack->buffer, GIN_UNLOCK);
+		freeGinBtreeStack(stack);
 	}
+	else
+		ginFinishSplit(btree, stack, true, buildStats);
 }
diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c
index ddae862720..3af027187a 100644
--- a/src/backend/access/gin/ginbulk.c
+++ b/src/backend/access/gin/ginbulk.c
@@ -4,7 +4,7 @@
  *	  routines for fast build of inverted index
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -187,7 +187,7 @@ ginInsertBAEntry(BuildAccumulator *accum,
  * Since the entries are being inserted into a balanced binary tree, you
  * might think that the order of insertion wouldn't be critical, but it turns
  * out that inserting the entries in sorted order results in a lot of
- * rebalancing operations and is slow.	To prevent this, we attempt to insert
+ * rebalancing operations and is slow.  To prevent this, we attempt to insert
  * the nodes in an order that will produce a nearly-balanced tree if the input
  * is in fact sorted.
  *
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 13601468ed..272a9ca7c0 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -1,10 +1,10 @@
 /*-------------------------------------------------------------------------
  *
  * gindatapage.c
- *	  page utilities routines for the postgres inverted index access method.
+ *	  routines for handling GIN posting tree pages.
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -15,69 +15,214 @@
 #include "postgres.h"
 
 #include "access/gin_private.h"
+#include "access/heapam_xlog.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
 #include "utils/rel.h"
 
-int
-ginCompareItemPointers(ItemPointer a, ItemPointer b)
+/*
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size, it is merged with the next posting list.
+ */
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
+
+/*
+ * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
+ * long segment. This is used when estimating how much space is required
+ * for N items, at minimum.
+ */
+#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
+
+/*
+ * A working struct for manipulating a posting tree leaf page.
+ */
+typedef struct
 {
-	BlockNumber ba = GinItemPointerGetBlockNumber(a);
-	BlockNumber bb = GinItemPointerGetBlockNumber(b);
+	dlist_head	segments;		/* a list of leafSegmentInfos */
 
-	if (ba == bb)
-	{
-		OffsetNumber oa = GinItemPointerGetOffsetNumber(a);
-		OffsetNumber ob = GinItemPointerGetOffsetNumber(b);
+	/*
+	 * The following fields represent how the segments are split across pages,
+	 * if a page split is required. Filled in by leafRepackItems.
+	 */
+	dlist_node *lastleft;		/* last segment on left page */
+	int			lsize;			/* total size on left page */
+	int			rsize;			/* total size on right page */
 
-		if (oa == ob)
-			return 0;
-		return (oa > ob) ? 1 : -1;
-	}
+	bool		oldformat;		/* page is in pre-9.4 format on disk */
+} disassembledLeaf;
 
-	return (ba > bb) ? 1 : -1;
-}
+typedef struct
+{
+	dlist_node	node;			/* linked list pointers */
+
+	/*-------------
+	 * 'action' indicates the status of this in-memory segment, compared to
+	 * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+	 *
+	 * UNMODIFIED	no changes
+	 * DELETE		the segment is to be removed. 'seg' and 'items' are
+	 *				ignored
+	 * INSERT		this is a completely new segment
+	 * REPLACE		this replaces an existing segment with new content
+	 * ADDITEMS		like REPLACE, but no items have been removed, and we track
+	 *				in detail what items have been added to this segment, in
+	 *				'modifieditems'
+	 *-------------
+	 */
+	char		action;
+
+	ItemPointerData *modifieditems;
+	int			nmodifieditems;
+
+	/*
+	 * The following fields represent the items in this segment. If 'items' is
+	 * not NULL, it contains a palloc'd array of the itemsin this segment. If
+	 * 'seg' is not NULL, it contains the items in an already-compressed
+	 * format. It can point to an on-disk page (!modified), or a palloc'd
+	 * segment in memory. If both are set, they must represent the same items.
+	 */
+	GinPostingList *seg;
+	ItemPointer items;
+	int			nitems;			/* # of items in 'items', if items != NULL */
+} leafSegmentInfo;
+
+static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
+static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+					  GinBtreeStack *stack,
+					  void *insertdata, BlockNumber updateblkno,
+					  XLogRecData **prdata, Page *newlpage, Page *newrpage);
+
+static disassembledLeaf *disassembleLeaf(Page page);
+static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
+static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
+			   int nNewItems);
+
+static XLogRecData *constructLeafRecompressWALData(Buffer buf,
+							   disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
+static void dataPlaceToPageLeafSplit(Buffer buf,
+						 disassembledLeaf *leaf,
+						 ItemPointerData lbound, ItemPointerData rbound,
+						 XLogRecData **prdata, Page lpage, Page rpage);
 
 /*
- * Merge two ordered arrays of itempointers, eliminating any duplicates.
- * Returns the number of items in the result.
- * Caller is responsible that there is enough space at *dst.
+ * Read TIDs from leaf data page to single uncompressed array. The TIDs are
+ * returned in ascending order.
+ *
+ * advancePast is a hint, indicating that the caller is only interested in
+ * TIDs > advancePast. To return all items, use ItemPointerSetMin.
+ *
+ * Note: This function can still return items smaller than advancePast that
+ * are in the same posting list as the items of interest, so the caller must
+ * still check all the returned items. But passing it allows this function to
+ * skip whole posting lists.
  */
-uint32
-ginMergeItemPointers(ItemPointerData *dst,
-					 ItemPointerData *a, uint32 na,
-					 ItemPointerData *b, uint32 nb)
+ItemPointer
+GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
 {
-	ItemPointerData *dptr = dst;
-	ItemPointerData *aptr = a,
-			   *bptr = b;
+	ItemPointer result;
 
-	while (aptr - a < na && bptr - b < nb)
+	if (GinPageIsCompressed(page))
 	{
-		int			cmp = ginCompareItemPointers(aptr, bptr);
+		GinPostingList *seg = GinDataLeafPageGetPostingList(page);
+		Size		len = GinDataLeafPageGetPostingListSize(page);
+		Pointer		endptr = ((Pointer) seg) + len;
+		GinPostingList *next;
 
-		if (cmp > 0)
-			*dptr++ = *bptr++;
-		else if (cmp == 0)
+		/* Skip to the segment containing advancePast+1 */
+		if (ItemPointerIsValid(&advancePast))
 		{
-			/* we want only one copy of the identical items */
-			*dptr++ = *bptr++;
-			aptr++;
+			next = GinNextPostingListSegment(seg);
+			while ((Pointer) next < endptr &&
+				   ginCompareItemPointers(&next->first, &advancePast) <= 0)
+			{
+				seg = next;
+				next = GinNextPostingListSegment(seg);
+			}
+			len = endptr - (Pointer) seg;
 		}
+
+		if (len > 0)
+			result = ginPostingListDecodeAllSegments(seg, len, nitems);
 		else
-			*dptr++ = *aptr++;
+		{
+			result = NULL;
+			*nitems = 0;
+		}
 	}
+	else
+	{
+		ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
 
-	while (aptr - a < na)
-		*dptr++ = *aptr++;
+		result = palloc((*nitems) * sizeof(ItemPointerData));
+		memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
+	}
 
-	while (bptr - b < nb)
-		*dptr++ = *bptr++;
+	return result;
+}
 
-	return dptr - dst;
+/*
+ * Places all TIDs from leaf data page to bitmap.
+ */
+int
+GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
+{
+	ItemPointer uncompressed;
+	int			nitems;
+
+	if (GinPageIsCompressed(page))
+	{
+		GinPostingList *segment = GinDataLeafPageGetPostingList(page);
+		Size		len = GinDataLeafPageGetPostingListSize(page);
+
+		nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
+	}
+	else
+	{
+		uncompressed = dataLeafPageGetUncompressed(page, &nitems);
+
+		if (nitems > 0)
+			tbm_add_tuples(tbm, uncompressed, nitems, false);
+	}
+
+	return nitems;
 }
 
 /*
- * Checks, should we move to right link...
- * Compares inserting itemp pointer with right bound of current page
+ * Get pointer to the uncompressed array of items on a pre-9.4 format
+ * uncompressed leaf page. The number of items in the array is returned in
+ * *nitems.
+ */
+static ItemPointer
+dataLeafPageGetUncompressed(Page page, int *nitems)
+{
+	ItemPointer items;
+
+	Assert(!GinPageIsCompressed(page));
+
+	/*
+	 * In the old pre-9.4 page format, the whole page content is used for
+	 * uncompressed items, and the number of items is stored in 'maxoff'
+	 */
+	items = (ItemPointer) GinDataPageGetData(page);
+	*nitems = GinPageGetOpaque(page)->maxoff;
+
+	return items;
+}
+
+/*
+ * Check if we should follow the right link to find the item we're searching
+ * for.
+ *
+ * Compares inserting item pointer with the right bound of the current page.
  */
 static bool
 dataIsMoveRight(GinBtree btree, Page page)
@@ -87,12 +232,12 @@ dataIsMoveRight(GinBtree btree, Page page)
 	if (GinPageRightMost(page))
 		return FALSE;
 
-	return (ginCompareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE;
+	return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? TRUE : FALSE;
 }
 
 /*
- * Find correct PostingItem in non-leaf page. It supposed that page
- * correctly chosen and searching value SHOULD be on page
+ * Find correct PostingItem in non-leaf page. It is assumed that this is
+ * the correct page, and the searched value SHOULD be on the page.
  */
 static BlockNumber
 dataLocateItem(GinBtree btree, GinBtreeStack *stack)
@@ -111,7 +256,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
 	{
 		stack->off = FirstOffsetNumber;
 		stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
-		return btree->getLeftMostPage(btree, page);
+		return btree->getLeftMostChild(btree, page);
 	}
 
 	low = FirstOffsetNumber;
@@ -124,7 +269,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
 
-		pitem = (PostingItem *) GinDataPageGetItem(page, mid);
+		pitem = GinDataPageGetPostingItem(page, mid);
 
 		if (mid == maxoff)
 		{
@@ -136,8 +281,8 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
 		}
 		else
 		{
-			pitem = (PostingItem *) GinDataPageGetItem(page, mid);
-			result = ginCompareItemPointers(btree->items + btree->curitem, &(pitem->key));
+			pitem = GinDataPageGetPostingItem(page, mid);
+			result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
 		}
 
 		if (result == 0)
@@ -154,67 +299,12 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
 	Assert(high >= FirstOffsetNumber && high <= maxoff);
 
 	stack->off = high;
-	pitem = (PostingItem *) GinDataPageGetItem(page, high);
+	pitem = GinDataPageGetPostingItem(page, high);
 	return PostingItemGetBlockNumber(pitem);
 }
 
 /*
- * Searches correct position for value on leaf page.
- * Page should be correctly chosen.
- * Returns true if value found on page.
- */
-static bool
-dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack)
-{
-	Page		page = BufferGetPage(stack->buffer);
-	OffsetNumber low,
-				high;
-	int			result;
-
-	Assert(GinPageIsLeaf(page));
-	Assert(GinPageIsData(page));
-
-	if (btree->fullScan)
-	{
-		stack->off = FirstOffsetNumber;
-		return TRUE;
-	}
-
-	low = FirstOffsetNumber;
-	high = GinPageGetOpaque(page)->maxoff;
-
-	if (high < low)
-	{
-		stack->off = FirstOffsetNumber;
-		return false;
-	}
-
-	high++;
-
-	while (high > low)
-	{
-		OffsetNumber mid = low + ((high - low) / 2);
-
-		result = ginCompareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid));
-
-		if (result == 0)
-		{
-			stack->off = mid;
-			return true;
-		}
-		else if (result > 0)
-			low = mid + 1;
-		else
-			high = mid;
-	}
-
-	stack->off = high;
-	return false;
-}
-
-/*
- * Finds links to blkno on non-leaf page, returns
- * offset of PostingItem
+ * Find link to blkno on non-leaf page, returns offset of PostingItem
  */
 static OffsetNumber
 dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
@@ -229,7 +319,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
 	/* if page isn't changed, we return storedOff */
 	if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
 	{
-		pitem = (PostingItem *) GinDataPageGetItem(page, storedOff);
+		pitem = GinDataPageGetPostingItem(page, storedOff);
 		if (PostingItemGetBlockNumber(pitem) == blkno)
 			return storedOff;
 
@@ -239,7 +329,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
 		 */
 		for (i = storedOff + 1; i <= maxoff; i++)
 		{
-			pitem = (PostingItem *) GinDataPageGetItem(page, i);
+			pitem = GinDataPageGetPostingItem(page, i);
 			if (PostingItemGetBlockNumber(pitem) == blkno)
 				return i;
 		}
@@ -250,7 +340,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
 	/* last chance */
 	for (i = FirstOffsetNumber; i <= maxoff; i++)
 	{
-		pitem = (PostingItem *) GinDataPageGetItem(page, i);
+		pitem = GinDataPageGetPostingItem(page, i);
 		if (PostingItemGetBlockNumber(pitem) == blkno)
 			return i;
 	}
@@ -259,7 +349,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
 }
 
 /*
- * returns blkno of leftmost child
+ * Return blkno of leftmost child
  */
 static BlockNumber
 dataGetLeftMostPage(GinBtree btree, Page page)
@@ -270,39 +360,49 @@ dataGetLeftMostPage(GinBtree btree, Page page)
 	Assert(GinPageIsData(page));
 	Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
 
-	pitem = (PostingItem *) GinDataPageGetItem(page, FirstOffsetNumber);
+	pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
 	return PostingItemGetBlockNumber(pitem);
 }
 
 /*
- * add ItemPointer or PostingItem to page. data should point to
- * correct value! depending on leaf or non-leaf page
+ * Add PostingItem to a non-leaf page.
  */
 void
-GinDataPageAddItem(Page page, void *data, OffsetNumber offset)
+GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
 {
 	OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
 	char	   *ptr;
 
+	Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
+	Assert(!GinPageIsLeaf(page));
+
 	if (offset == InvalidOffsetNumber)
 	{
-		ptr = GinDataPageGetItem(page, maxoff + 1);
+		ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
 	}
 	else
 	{
-		ptr = GinDataPageGetItem(page, offset);
-		if (maxoff + 1 - offset != 0)
-			memmove(ptr + GinSizeOfDataPageItem(page),
+		ptr = (char *) GinDataPageGetPostingItem(page, offset);
+		if (offset != maxoff + 1)
+			memmove(ptr + sizeof(PostingItem),
 					ptr,
-					(maxoff - offset + 1) * GinSizeOfDataPageItem(page));
+					(maxoff - offset + 1) *sizeof(PostingItem));
 	}
-	memcpy(ptr, data, GinSizeOfDataPageItem(page));
+	memcpy(ptr, data, sizeof(PostingItem));
+
+	maxoff++;
+	GinPageGetOpaque(page)->maxoff = maxoff;
 
-	GinPageGetOpaque(page)->maxoff++;
+	/*
+	 * Also set pd_lower to the end of the posting items, to follow the
+	 * "standard" page layout, so that we can squeeze out the unused space
+	 * from full-page images.
+	 */
+	GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
 }
 
 /*
- * Deletes posting item from non-leaf page
+ * Delete posting item from non-leaf page
  */
 void
 GinPageDeletePostingItem(Page page, OffsetNumber offset)
@@ -313,271 +413,892 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset)
 	Assert(offset >= FirstOffsetNumber && offset <= maxoff);
 
 	if (offset != maxoff)
-		memmove(GinDataPageGetItem(page, offset), GinDataPageGetItem(page, offset + 1),
+		memmove(GinDataPageGetPostingItem(page, offset),
+				GinDataPageGetPostingItem(page, offset + 1),
 				sizeof(PostingItem) * (maxoff - offset));
 
-	GinPageGetOpaque(page)->maxoff--;
+	maxoff--;
+	GinPageGetOpaque(page)->maxoff = maxoff;
+
+	GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
 }
 
 /*
- * checks space to install new value,
- * item pointer never deletes!
+ * Places keys to leaf data page and fills WAL record.
  */
-static bool
-dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
+static GinPlaceToPageRC
+dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+					void *insertdata, XLogRecData **prdata,
+					Page *newlpage, Page *newrpage)
 {
+	GinBtreeDataLeafInsertData *items = insertdata;
+	ItemPointer newItems = &items->items[items->curitem];
+	int			maxitems = items->nitem - items->curitem;
 	Page		page = BufferGetPage(buf);
+	int			i;
+	ItemPointerData rbound;
+	ItemPointerData lbound;
+	bool		needsplit;
+	bool		append;
+	int			segsize;
+	Size		freespace;
+	MemoryContext tmpCxt;
+	MemoryContext oldCxt;
+	disassembledLeaf *leaf;
+	leafSegmentInfo *lastleftinfo;
+	ItemPointerData maxOldItem;
+	ItemPointerData remaining;
 
 	Assert(GinPageIsData(page));
-	Assert(!btree->isDelete);
 
-	if (GinPageIsLeaf(page))
+	rbound = *GinDataPageGetRightBound(page);
+
+	/*
+	 * Count how many of the new items belong to this page.
+	 */
+	if (!GinPageRightMost(page))
 	{
-		if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+		for (i = 0; i < maxitems; i++)
 		{
-			if ((btree->nitem - btree->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
-				return true;
+			if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
+			{
+				/*
+				 * This needs to go to some other location in the tree. (The
+				 * caller should've chosen the insert location so that at
+				 * least the first item goes here.)
+				 */
+				Assert(i > 0);
+				break;
+			}
 		}
-		else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
-			return true;
+		maxitems = i;
 	}
-	else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page))
-		return true;
 
-	return false;
-}
+	/*
+	 * The following operations do quite a lot of small memory allocations,
+	 * create a temporary memory context so that we don't need to keep track
+	 * of them individually.
+	 */
+	tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+								   "Gin split temporary context",
+								   ALLOCSET_DEFAULT_MINSIZE,
+								   ALLOCSET_DEFAULT_INITSIZE,
+								   ALLOCSET_DEFAULT_MAXSIZE);
+	oldCxt = MemoryContextSwitchTo(tmpCxt);
 
-/*
- * In case of previous split update old child blkno to
- * new right page
- * item pointer never deletes!
- */
-static BlockNumber
-dataPrepareData(GinBtree btree, Page page, OffsetNumber off)
-{
-	BlockNumber ret = InvalidBlockNumber;
+	leaf = disassembleLeaf(page);
 
-	Assert(GinPageIsData(page));
+	/*
+	 * Are we appending to the end of the page? IOW, are all the new items
+	 * larger than any of the existing items.
+	 */
+	if (!dlist_is_empty(&leaf->segments))
+	{
+		lastleftinfo = dlist_container(leafSegmentInfo, node,
+									   dlist_tail_node(&leaf->segments));
+		if (!lastleftinfo->items)
+			lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+													   &lastleftinfo->nitems);
+		maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
+		if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
+			append = true;
+		else
+			append = false;
+	}
+	else
+	{
+		ItemPointerSetMin(&maxOldItem);
+		append = true;
+	}
 
-	if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
+	/*
+	 * If we're appending to the end of the page, we will append as many items
+	 * as we can fit (after splitting), and stop when the pages becomes full.
+	 * Otherwise we have to limit the number of new items to insert, because
+	 * once we start packing we can't just stop when we run out of space,
+	 * because we must make sure that all the old items still fit.
+	 */
+	if (GinPageIsCompressed(page))
+		freespace = GinDataLeafPageGetFreeSpace(page);
+	else
+		freespace = 0;
+	if (append)
 	{
-		PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, off);
+		/*
+		 * Even when appending, trying to append more items than will fit is
+		 * not completely free, because we will merge the new items and old
+		 * items into an array below. In the best case, every new item fits in
+		 * a single byte, and we can use all the free space on the old page as
+		 * well as the new page. For simplicity, ignore segment overhead etc.
+		 */
+		maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
+	}
+	else
+	{
+		/*
+		 * Calculate a conservative estimate of how many new items we can fit
+		 * on the two pages after splitting.
+		 *
+		 * We can use any remaining free space on the old page to store full
+		 * segments, as well as the new page. Each full-sized segment can hold
+		 * at least MinTuplesPerSegment items
+		 */
+		int			nnewsegments;
+
+		nnewsegments = freespace / GinPostingListSegmentMaxSize;
+		nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
+		maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
+	}
+
+	/* Add the new items to the segments */
+	if (!addItemsToLeaf(leaf, newItems, maxitems))
+	{
+		/* all items were duplicates, we have nothing to do */
+		items->curitem += maxitems;
+
+		MemoryContextSwitchTo(oldCxt);
+		MemoryContextDelete(tmpCxt);
+
+		return UNMODIFIED;
+	}
+
+	/*
+	 * Pack the items back to compressed segments, ready for writing to disk.
+	 */
+	needsplit = leafRepackItems(leaf, &remaining);
+
+	/*
+	 * Did all the new items fit?
+	 *
+	 * If we're appending, it's OK if they didn't. But as a sanity check,
+	 * verify that all the old items fit.
+	 */
+	if (ItemPointerIsValid(&remaining))
+	{
+		if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
+			elog(ERROR, "could not split GIN page; all old items didn't fit");
+
+		/* Count how many of the new items did fit. */
+		for (i = 0; i < maxitems; i++)
+		{
+			if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
+				break;
+		}
+		if (i == 0)
+			elog(ERROR, "could not split GIN page; no new items fit");
+		maxitems = i;
+	}
+
+	if (!needsplit)
+	{
+		/*
+		 * Great, all the items fit on a single page. Construct a WAL record
+		 * describing the changes we made, and write the segments back to the
+		 * page.
+		 *
+		 * Once we start modifying the page, there's no turning back. The
+		 * caller is responsible for calling END_CRIT_SECTION() after writing
+		 * the WAL record.
+		 */
+		MemoryContextSwitchTo(oldCxt);
+		if (RelationNeedsWAL(btree->index))
+			*prdata = constructLeafRecompressWALData(buf, leaf);
+		else
+			*prdata = NULL;
+		START_CRIT_SECTION();
+		dataPlaceToPageLeafRecompress(buf, leaf);
+
+		if (append)
+			elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
+				 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+				 items->nitem - items->curitem - maxitems);
+		else
+			elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
+				 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+				 items->nitem - items->curitem - maxitems);
+	}
+	else
+	{
+		/*
+		 * Had to split.
+		 *
+		 * We already divided the segments between the left and the right
+		 * page. The left page was filled as full as possible, and the rest
+		 * overflowed to the right page. When building a new index, that's
+		 * good, because the table is scanned from beginning to end and there
+		 * won't be any more insertions to the left page during the build.
+		 * This packs the index as tight as possible. But otherwise, split
+		 * 50/50, by moving segments from the left page to the right page
+		 * until they're balanced.
+		 *
+		 * As a further heuristic, when appending items to the end of the
+		 * page, split 75/25, one the assumption that subsequent insertions
+		 * will probably also go to the end. This packs the index somewhat
+		 * tighter when appending to a table, which is very common.
+		 */
+		if (!btree->isBuild)
+		{
+			while (dlist_has_prev(&leaf->segments, leaf->lastleft))
+			{
+				lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+
+				segsize = SizeOfGinPostingList(lastleftinfo->seg);
+				if (append)
+				{
+					if ((leaf->lsize - segsize) - (leaf->lsize - segsize) < BLCKSZ / 4)
+						break;
+				}
+				else
+				{
+					if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
+						break;
+				}
+
+				leaf->lsize -= segsize;
+				leaf->rsize += segsize;
+				leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
+			}
+		}
+		Assert(leaf->lsize <= GinDataPageMaxDataSize);
+		Assert(leaf->rsize <= GinDataPageMaxDataSize);
 
-		PostingItemSetBlockNumber(pitem, btree->rightblkno);
-		ret = btree->rightblkno;
+		/*
+		 * Fetch the max item in the left page's last segment; it becomes the
+		 * right bound of the page.
+		 */
+		lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+		if (!lastleftinfo->items)
+			lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+													   &lastleftinfo->nitems);
+		lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
+
+		*newlpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+		*newrpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+
+		dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound,
+								 prdata, *newlpage, *newrpage);
+
+		Assert(GinPageRightMost(page) ||
+			   ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
+								   GinDataPageGetRightBound(*newrpage)) < 0);
+
+		if (append)
+			elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
+				 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+				 items->nitem - items->curitem - maxitems);
+		else
+			elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
+				 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+				 items->nitem - items->curitem - maxitems);
 	}
 
-	btree->rightblkno = InvalidBlockNumber;
+	MemoryContextSwitchTo(oldCxt);
+	MemoryContextDelete(tmpCxt);
+
+	items->curitem += maxitems;
 
-	return ret;
+	return needsplit ? SPLIT : INSERTED;
 }
 
 /*
- * Places keys to page and fills WAL record. In case leaf page and
- * build mode puts all ItemPointers to page.
+ * Vacuum a posting tree leaf page.
  */
-static void
-dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
+void
+ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
 {
-	Page		page = BufferGetPage(buf);
-	int			sizeofitem = GinSizeOfDataPageItem(page);
-	int			cnt = 0;
+	Page		page = BufferGetPage(buffer);
+	disassembledLeaf *leaf;
+	bool		removedsomething = false;
+	dlist_iter	iter;
 
-	/* these must be static so they can be returned to caller */
-	static XLogRecData rdata[3];
-	static ginxlogInsert data;
+	leaf = disassembleLeaf(page);
 
-	*prdata = rdata;
-	Assert(GinPageIsData(page));
-
-	data.updateBlkno = dataPrepareData(btree, page, off);
+	/* Vacuum each segment. */
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+		int			oldsegsize;
+		ItemPointer cleaned;
+		int			ncleaned;
+
+		if (!seginfo->items)
+			seginfo->items = ginPostingListDecode(seginfo->seg,
+												  &seginfo->nitems);
+		if (seginfo->seg)
+			oldsegsize = SizeOfGinPostingList(seginfo->seg);
+		else
+			oldsegsize = GinDataPageMaxDataSize;
+
+		cleaned = ginVacuumItemPointers(gvs,
+										seginfo->items,
+										seginfo->nitems,
+										&ncleaned);
+		pfree(seginfo->items);
+		seginfo->items = NULL;
+		seginfo->nitems = 0;
+		if (cleaned)
+		{
+			if (ncleaned > 0)
+			{
+				int			npacked;
+
+				seginfo->seg = ginCompressPostingList(cleaned,
+													  ncleaned,
+													  oldsegsize,
+													  &npacked);
+				/* Removing an item never increases the size of the segment */
+				if (npacked != ncleaned)
+					elog(ERROR, "could not fit vacuumed posting list");
+				seginfo->action = GIN_SEGMENT_REPLACE;
+			}
+			else
+			{
+				seginfo->seg = NULL;
+				seginfo->items = NULL;
+				seginfo->action = GIN_SEGMENT_DELETE;
+			}
+			seginfo->nitems = ncleaned;
 
-	data.node = btree->index->rd_node;
-	data.blkno = BufferGetBlockNumber(buf);
-	data.offset = off;
-	data.nitem = 1;
-	data.isDelete = FALSE;
-	data.isData = TRUE;
-	data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
+			removedsomething = true;
+		}
+	}
 
 	/*
-	 * Prevent full page write if child's split occurs. That is needed to
-	 * remove incomplete splits while replaying WAL
+	 * If we removed any items, reconstruct the page from the pieces.
 	 *
-	 * data.updateBlkno contains new block number (of newly created right
-	 * page) for recently splited page.
+	 * We don't try to re-encode the segments here, even though some of them
+	 * might be really small now that we've removed some items from them. It
+	 * seems like a waste of effort, as there isn't really any benefit from
+	 * larger segments per se; larger segments only help to pack more items in
+	 * the same space. We might as well delay doing that until the next
+	 * insertion, which will need to re-encode at least part of the page
+	 * anyway.
+	 *
+	 * Also note if the page was in uncompressed, pre-9.4 format before, it is
+	 * now represented as one huge segment that contains all the items. It
+	 * might make sense to split that, to speed up random access, but we don't
+	 * bother. You'll have to REINDEX anyway if you want the full gain of the
+	 * new tighter index format.
 	 */
-	if (data.updateBlkno == InvalidBlockNumber)
+	if (removedsomething)
 	{
-		rdata[0].buffer = buf;
-		rdata[0].buffer_std = FALSE;
-		rdata[0].data = NULL;
-		rdata[0].len = 0;
-		rdata[0].next = &rdata[1];
-		cnt++;
+		XLogRecData *payloadrdata = NULL;
+		bool		modified;
+
+		/*
+		 * Make sure we have a palloc'd copy of all segments, after the first
+		 * segment that is modified. (dataPlaceToPageLeafRecompress requires
+		 * this).
+		 */
+		modified = false;
+		dlist_foreach(iter, &leaf->segments)
+		{
+			leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+													   iter.cur);
+
+			if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+				modified = true;
+			if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+			{
+				int			segsize = SizeOfGinPostingList(seginfo->seg);
+				GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+
+				memcpy(tmp, seginfo->seg, segsize);
+				seginfo->seg = tmp;
+			}
+		}
+
+		if (RelationNeedsWAL(indexrel))
+			payloadrdata = constructLeafRecompressWALData(buffer, leaf);
+		START_CRIT_SECTION();
+		dataPlaceToPageLeafRecompress(buffer, leaf);
+
+		MarkBufferDirty(buffer);
+
+		if (RelationNeedsWAL(indexrel))
+		{
+			XLogRecPtr	recptr;
+			XLogRecData rdata;
+			ginxlogVacuumDataLeafPage xlrec;
+
+			xlrec.node = indexrel->rd_node;
+			xlrec.blkno = BufferGetBlockNumber(buffer);
+
+			rdata.buffer = InvalidBuffer;
+			rdata.data = (char *) &xlrec;
+			rdata.len = offsetof(ginxlogVacuumDataLeafPage, data);
+			rdata.next = payloadrdata;
+
+			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata);
+			PageSetLSN(page, recptr);
+		}
+
+		END_CRIT_SECTION();
 	}
+}
 
-	rdata[cnt].buffer = InvalidBuffer;
-	rdata[cnt].data = (char *) &data;
-	rdata[cnt].len = sizeof(ginxlogInsert);
-	rdata[cnt].next = &rdata[cnt + 1];
-	cnt++;
+/*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf.
+ */
+static XLogRecData *
+constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
+{
+	int			nmodified = 0;
+	char	   *walbufbegin;
+	char	   *walbufend;
+	XLogRecData *rdata;
+	dlist_iter	iter;
+	int			segno;
+	ginxlogRecompressDataLeaf *recompress_xlog;
+
+	/* Count the modified segments */
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+												   iter.cur);
 
-	rdata[cnt].buffer = InvalidBuffer;
-	rdata[cnt].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem));
-	rdata[cnt].len = sizeofitem;
-	rdata[cnt].next = NULL;
+		if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+			nmodified++;
+	}
 
-	if (GinPageIsLeaf(page))
+	walbufbegin = palloc(
+						 sizeof(ginxlogRecompressDataLeaf) +
+						 BLCKSZ +		/* max size needed to hold the segment
+										 * data */
+						 nmodified * 2 +		/* (segno + action) per action */
+						 sizeof(XLogRecData));
+	walbufend = walbufbegin;
+
+	recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+	walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+	recompress_xlog->nactions = nmodified;
+
+	segno = 0;
+	dlist_foreach(iter, &leaf->segments)
 	{
-		if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+												   iter.cur);
+		int			segsize = 0;
+		int			datalen;
+		uint8		action = seginfo->action;
+
+		if (action == GIN_SEGMENT_UNMODIFIED)
 		{
-			/* usually, create index... */
-			uint32		savedPos = btree->curitem;
+			segno++;
+			continue;
+		}
 
-			while (btree->curitem < btree->nitem)
-			{
-				GinDataPageAddItem(page, btree->items + btree->curitem, off);
-				off++;
-				btree->curitem++;
-			}
-			data.nitem = btree->curitem - savedPos;
-			rdata[cnt].len = sizeofitem * data.nitem;
+		if (action != GIN_SEGMENT_DELETE)
+			segsize = SizeOfGinPostingList(seginfo->seg);
+
+		/*
+		 * If storing the uncompressed list of added item pointers would take
+		 * more space than storing the compressed segment as is, do that
+		 * instead.
+		 */
+		if (action == GIN_SEGMENT_ADDITEMS &&
+			seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+		{
+			action = GIN_SEGMENT_REPLACE;
 		}
-		else
+
+		*((uint8 *) (walbufend++)) = segno;
+		*(walbufend++) = action;
+
+		switch (action)
 		{
-			GinDataPageAddItem(page, btree->items + btree->curitem, off);
-			btree->curitem++;
+			case GIN_SEGMENT_DELETE:
+				datalen = 0;
+				break;
+
+			case GIN_SEGMENT_ADDITEMS:
+				datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+				memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
+				memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+				datalen += sizeof(uint16);
+				break;
+
+			case GIN_SEGMENT_INSERT:
+			case GIN_SEGMENT_REPLACE:
+				datalen = SHORTALIGN(segsize);
+				memcpy(walbufend, seginfo->seg, segsize);
+				break;
+
+			default:
+				elog(ERROR, "unexpected GIN leaf action %d", action);
 		}
+		walbufend += datalen;
+
+		if (action != GIN_SEGMENT_INSERT)
+			segno++;
 	}
-	else
-		GinDataPageAddItem(page, &(btree->pitem), off);
+
+	rdata = (XLogRecData *) MAXALIGN(walbufend);
+	rdata->buffer = buf;
+	rdata->buffer_std = TRUE;
+	rdata->data = walbufbegin;
+	rdata->len = walbufend - walbufbegin;
+	rdata->next = NULL;
+
+	return rdata;
 }
 
 /*
- * split page and fills WAL record. original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data. In leaf page and build mode puts all
- * ItemPointers to pages. Also, in build mode splits data by way to full fulled
- * left page
+ * Assemble a disassembled posting tree leaf page back to a buffer.
+ *
+ * *prdata is filled with WAL information about this operation. The caller
+ * is responsible for inserting to the WAL, along with any other information
+ * about the operation that triggered this recompression.
+ *
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified and whose preceding
+ * segments have not been modified either.
  */
-static Page
-dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
+static void
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
 {
+	Page		page = BufferGetPage(buf);
 	char	   *ptr;
-	OffsetNumber separator;
-	ItemPointer bound;
-	Page		lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
-	ItemPointerData oldbound = *GinDataPageGetRightBound(lpage);
-	int			sizeofitem = GinSizeOfDataPageItem(lpage);
-	OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff;
-	Page		rpage = BufferGetPage(rbuf);
-	Size		pageSize = PageGetPageSize(lpage);
-	Size		freeSpace;
-	uint32		nCopied = 1;
+	int			newsize;
+	bool		modified = false;
+	dlist_iter	iter;
+	int			segsize;
 
-	/* these must be static so they can be returned to caller */
-	static ginxlogSplit data;
-	static XLogRecData rdata[4];
-	static char vector[2 * BLCKSZ];
+	/*
+	 * If the page was in pre-9.4 format before, convert the header, and force
+	 * all segments to be copied to the page whether they were modified or
+	 * not.
+	 */
+	if (!GinPageIsCompressed(page))
+	{
+		Assert(leaf->oldformat);
+		GinPageSetCompressed(page);
+		GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+		modified = true;
+	}
+
+	ptr = (char *) GinDataLeafPageGetPostingList(page);
+	newsize = 0;
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
 
-	GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
-	freeSpace = GinDataPageGetFreeSpace(rpage);
+		if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+			modified = true;
 
-	*prdata = rdata;
-	data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
-		InvalidOffsetNumber : PostingItemGetBlockNumber(&(btree->pitem));
-	data.updateBlkno = dataPrepareData(btree, lpage, off);
+		if (seginfo->action != GIN_SEGMENT_DELETE)
+		{
+			segsize = SizeOfGinPostingList(seginfo->seg);
+
+			if (modified)
+				memcpy(ptr, seginfo->seg, segsize);
 
-	memcpy(vector, GinDataPageGetItem(lpage, FirstOffsetNumber),
-		   maxoff * sizeofitem);
+			ptr += segsize;
+			newsize += segsize;
+		}
+	}
+
+	Assert(newsize <= GinDataPageMaxDataSize);
+	GinDataPageSetDataSize(page, newsize);
+}
 
-	if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
+/*
+ * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
+ * segments to two pages instead of one.
+ *
+ * This is different from the non-split cases in that this does not modify
+ * the original page directly, but to temporary in-memory copies of the new
+ * left and right pages.
+ */
+static void
+dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
+						 ItemPointerData lbound, ItemPointerData rbound,
+						 XLogRecData **prdata, Page lpage, Page rpage)
+{
+	char	   *ptr;
+	int			segsize;
+	int			lsize;
+	int			rsize;
+	dlist_node *node;
+	dlist_node *firstright;
+	leafSegmentInfo *seginfo;
+
+	/* these must be static so they can be returned to caller */
+	static ginxlogSplitDataLeaf split_xlog;
+	static XLogRecData rdata[3];
+
+	/* Initialize temporary pages to hold the new left and right pages */
+	GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+	GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+
+	/*
+	 * Copy the segments that go to the left page.
+	 *
+	 * XXX: We should skip copying the unmodified part of the left page, like
+	 * we do when recompressing.
+	 */
+	lsize = 0;
+	ptr = (char *) GinDataLeafPageGetPostingList(lpage);
+	firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
+	for (node = dlist_head_node(&leaf->segments);
+		 node != firstright;
+		 node = dlist_next_node(&leaf->segments, node))
 	{
-		nCopied = 0;
-		while (btree->curitem < btree->nitem &&
-			   maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
+		seginfo = dlist_container(leafSegmentInfo, node, node);
+
+		if (seginfo->action != GIN_SEGMENT_DELETE)
 		{
-			memcpy(vector + maxoff * sizeof(ItemPointerData),
-				   btree->items + btree->curitem,
-				   sizeof(ItemPointerData));
-			maxoff++;
-			nCopied++;
-			btree->curitem++;
+			segsize = SizeOfGinPostingList(seginfo->seg);
+			memcpy(ptr, seginfo->seg, segsize);
+			ptr += segsize;
+			lsize += segsize;
 		}
 	}
-	else
+	Assert(lsize == leaf->lsize);
+	GinDataPageSetDataSize(lpage, lsize);
+	*GinDataPageGetRightBound(lpage) = lbound;
+
+	/* Copy the segments that go to the right page */
+	ptr = (char *) GinDataLeafPageGetPostingList(rpage);
+	rsize = 0;
+	for (node = firstright;
+		 ;
+		 node = dlist_next_node(&leaf->segments, node))
 	{
-		ptr = vector + (off - 1) * sizeofitem;
-		if (maxoff + 1 - off != 0)
-			memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem);
-		if (GinPageIsLeaf(lpage))
+		seginfo = dlist_container(leafSegmentInfo, node, node);
+
+		if (seginfo->action != GIN_SEGMENT_DELETE)
 		{
-			memcpy(ptr, btree->items + btree->curitem, sizeofitem);
-			btree->curitem++;
+			segsize = SizeOfGinPostingList(seginfo->seg);
+			memcpy(ptr, seginfo->seg, segsize);
+			ptr += segsize;
+			rsize += segsize;
 		}
-		else
-			memcpy(ptr, &(btree->pitem), sizeofitem);
 
-		maxoff++;
+		if (!dlist_has_next(&leaf->segments, node))
+			break;
+	}
+	Assert(rsize == leaf->rsize);
+	GinDataPageSetDataSize(rpage, rsize);
+	*GinDataPageGetRightBound(rpage) = rbound;
+
+	/* Create WAL record */
+	split_xlog.lsize = lsize;
+	split_xlog.rsize = rsize;
+	split_xlog.lrightbound = lbound;
+	split_xlog.rrightbound = rbound;
+
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].data = (char *) &split_xlog;
+	rdata[0].len = sizeof(ginxlogSplitDataLeaf);
+	rdata[0].next = &rdata[1];
+
+	rdata[1].buffer = InvalidBuffer;
+	rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage);
+	rdata[1].len = lsize;
+	rdata[1].next = &rdata[2];
+
+	rdata[2].buffer = InvalidBuffer;
+	rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage);
+	rdata[2].len = rsize;
+	rdata[2].next = NULL;
+
+	*prdata = rdata;
+}
+
+/*
+ * Place a PostingItem to page, and fill a WAL record.
+ *
+ * If the item doesn't fit, returns false without modifying the page.
+ *
+ * In addition to inserting the given item, the downlink of the existing item
+ * at 'off' is updated to point to 'updateblkno'.
+ */
+static GinPlaceToPageRC
+dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+						void *insertdata, BlockNumber updateblkno,
+						XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+	Page		page = BufferGetPage(buf);
+	OffsetNumber off = stack->off;
+	PostingItem *pitem;
+
+	/* these must be static so they can be returned to caller */
+	static XLogRecData rdata;
+	static ginxlogInsertDataInternal data;
+
+	/* split if we have to */
+	if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
+	{
+		dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
+							  prdata, newlpage, newrpage);
+		return SPLIT;
 	}
 
+	*prdata = &rdata;
+	Assert(GinPageIsData(page));
+
+	START_CRIT_SECTION();
+
+	/* Update existing downlink to point to next page (on internal page) */
+	pitem = GinDataPageGetPostingItem(page, off);
+	PostingItemSetBlockNumber(pitem, updateblkno);
+
+	/* Add new item */
+	pitem = (PostingItem *) insertdata;
+	GinDataPageAddPostingItem(page, pitem, off);
+
+	data.offset = off;
+	data.newitem = *pitem;
+
+	rdata.buffer = buf;
+	rdata.buffer_std = TRUE;
+	rdata.data = (char *) &data;
+	rdata.len = sizeof(ginxlogInsertDataInternal);
+	rdata.next = NULL;
+
+	return INSERTED;
+}
+
+/*
+ * Places an item (or items) to a posting tree. Calls relevant function of
+ * internal of leaf page because they are handled very differently.
+ */
+static GinPlaceToPageRC
+dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+				void *insertdata, BlockNumber updateblkno,
+				XLogRecData **prdata,
+				Page *newlpage, Page *newrpage)
+{
+	Page		page = BufferGetPage(buf);
+
+	Assert(GinPageIsData(page));
+
+	if (GinPageIsLeaf(page))
+		return dataPlaceToPageLeaf(btree, buf, stack, insertdata,
+								   prdata, newlpage, newrpage);
+	else
+		return dataPlaceToPageInternal(btree, buf, stack,
+									   insertdata, updateblkno,
+									   prdata, newlpage, newrpage);
+}
+
+/*
+ * Split page and fill WAL record. Returns a new temp buffer filled with data
+ * that should go to the left page. The original buffer is left untouched.
+ */
+static void
+dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+					  GinBtreeStack *stack,
+					  void *insertdata, BlockNumber updateblkno,
+					  XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+	Page		oldpage = BufferGetPage(origbuf);
+	OffsetNumber off = stack->off;
+	int			nitems = GinPageGetOpaque(oldpage)->maxoff;
+	int			nleftitems;
+	int			nrightitems;
+	Size		pageSize = PageGetPageSize(oldpage);
+	ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
+	ItemPointer bound;
+	Page		lpage;
+	Page		rpage;
+	OffsetNumber separator;
+
+	/* these must be static so they can be returned to caller */
+	static ginxlogSplitDataInternal data;
+	static XLogRecData rdata[4];
+	static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
+
+	lpage = PageGetTempPage(oldpage);
+	rpage = PageGetTempPage(oldpage);
+	GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+	GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+
+	*prdata = rdata;
+
 	/*
-	 * we suppose that during index creation table scaned from begin to end,
-	 * so ItemPointers are monotonically increased..
+	 * First construct a new list of PostingItems, which includes all the old
+	 * items, and the new item.
 	 */
-	if (btree->isBuild && GinPageRightMost(lpage))
-		separator = freeSpace / sizeofitem;
-	else
-		separator = maxoff / 2;
+	memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
+		   (off - 1) * sizeof(PostingItem));
 
-	GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
-	GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
+	allitems[off - 1] = *((PostingItem *) insertdata);
+	memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
+		   (nitems - (off - 1)) * sizeof(PostingItem));
+	nitems++;
 
-	memcpy(GinDataPageGetItem(lpage, FirstOffsetNumber), vector, separator * sizeofitem);
-	GinPageGetOpaque(lpage)->maxoff = separator;
-	memcpy(GinDataPageGetItem(rpage, FirstOffsetNumber),
-		 vector + separator * sizeofitem, (maxoff - separator) * sizeofitem);
-	GinPageGetOpaque(rpage)->maxoff = maxoff - separator;
+	/* Update existing downlink to point to next page */
+	PostingItemSetBlockNumber(&allitems[off], updateblkno);
 
-	PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf));
-	if (GinPageIsLeaf(lpage))
-		btree->pitem.key = *(ItemPointerData *) GinDataPageGetItem(lpage,
-											GinPageGetOpaque(lpage)->maxoff);
+	/*
+	 * When creating a new index, fit as many tuples as possible on the left
+	 * page, on the assumption that the table is scanned from beginning to
+	 * end. This packs the index as tight as possible.
+	 */
+	if (btree->isBuild && GinPageRightMost(oldpage))
+		separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
 	else
-		btree->pitem.key = ((PostingItem *) GinDataPageGetItem(lpage,
-									  GinPageGetOpaque(lpage)->maxoff))->key;
-	btree->rightblkno = BufferGetBlockNumber(rbuf);
+		separator = nitems / 2;
+	nleftitems = separator;
+	nrightitems = nitems - separator;
+
+	memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
+		   allitems,
+		   nleftitems * sizeof(PostingItem));
+	GinPageGetOpaque(lpage)->maxoff = nleftitems;
+	memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
+		   &allitems[separator],
+		   nrightitems * sizeof(PostingItem));
+	GinPageGetOpaque(rpage)->maxoff = nrightitems;
+
+	/*
+	 * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
+	 */
+	GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
+	GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
 
 	/* set up right bound for left page */
 	bound = GinDataPageGetRightBound(lpage);
-	*bound = btree->pitem.key;
+	*bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
 
 	/* set up right bound for right page */
-	bound = GinDataPageGetRightBound(rpage);
-	*bound = oldbound;
+	*GinDataPageGetRightBound(rpage) = oldbound;
 
-	data.node = btree->index->rd_node;
-	data.rootBlkno = InvalidBlockNumber;
-	data.lblkno = BufferGetBlockNumber(lbuf);
-	data.rblkno = BufferGetBlockNumber(rbuf);
 	data.separator = separator;
-	data.nitem = maxoff;
-	data.isData = TRUE;
-	data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
-	data.isRootSplit = FALSE;
+	data.nitem = nitems;
 	data.rightbound = oldbound;
 
 	rdata[0].buffer = InvalidBuffer;
 	rdata[0].data = (char *) &data;
-	rdata[0].len = sizeof(ginxlogSplit);
+	rdata[0].len = sizeof(ginxlogSplitDataInternal);
 	rdata[0].next = &rdata[1];
 
 	rdata[1].buffer = InvalidBuffer;
-	rdata[1].data = vector;
-	rdata[1].len = MAXALIGN(maxoff * sizeofitem);
+	rdata[1].data = (char *) allitems;
+	rdata[1].len = nitems * sizeof(PostingItem);
 	rdata[1].next = NULL;
 
-	return lpage;
+	*newlpage = lpage;
+	*newrpage = rpage;
+}
+
+/*
+ * Construct insertion payload for inserting the downlink for given buffer.
+ */
+static void *
+dataPrepareDownlink(GinBtree btree, Buffer lbuf)
+{
+	PostingItem *pitem = palloc(sizeof(PostingItem));
+	Page		lpage = BufferGetPage(lbuf);
+
+	PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
+	pitem->key = *GinDataPageGetRightBound(lpage);
+
+	return pitem;
 }
 
 /*
@@ -585,102 +1306,596 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe
  * Also called from ginxlog, should not use btree
  */
 void
-ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
 {
-	Page		page = BufferGetPage(root),
-				lpage = BufferGetPage(lbuf),
-				rpage = BufferGetPage(rbuf);
 	PostingItem li,
 				ri;
 
 	li.key = *GinDataPageGetRightBound(lpage);
-	PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf));
-	GinDataPageAddItem(page, &li, InvalidOffsetNumber);
+	PostingItemSetBlockNumber(&li, lblkno);
+	GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
 
 	ri.key = *GinDataPageGetRightBound(rpage);
-	PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf));
-	GinDataPageAddItem(page, &ri, InvalidOffsetNumber);
+	PostingItemSetBlockNumber(&ri, rblkno);
+	GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
+}
+
+
+/*** Functions to work with disassembled leaf pages ***/
+
+/*
+ * Disassemble page into a disassembledLeaf struct.
+ */
+static disassembledLeaf *
+disassembleLeaf(Page page)
+{
+	disassembledLeaf *leaf;
+	GinPostingList *seg;
+	Pointer		segbegin;
+	Pointer		segend;
+
+	leaf = palloc0(sizeof(disassembledLeaf));
+	dlist_init(&leaf->segments);
+
+	if (GinPageIsCompressed(page))
+	{
+		/*
+		 * Create a leafSegment entry for each segment.
+		 */
+		seg = GinDataLeafPageGetPostingList(page);
+		segbegin = (Pointer) seg;
+		segend = segbegin + GinDataLeafPageGetPostingListSize(page);
+		while ((Pointer) seg < segend)
+		{
+			leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+
+			seginfo->action = GIN_SEGMENT_UNMODIFIED;
+			seginfo->seg = seg;
+			seginfo->items = NULL;
+			seginfo->nitems = 0;
+			dlist_push_tail(&leaf->segments, &seginfo->node);
+
+			seg = GinNextPostingListSegment(seg);
+		}
+		leaf->oldformat = false;
+	}
+	else
+	{
+		/*
+		 * A pre-9.4 format uncompressed page is represented by a single
+		 * segment, with an array of items.
+		 */
+		ItemPointer uncompressed;
+		int			nuncompressed;
+		leafSegmentInfo *seginfo;
+
+		uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
+
+		seginfo = palloc(sizeof(leafSegmentInfo));
+
+		seginfo->action = GIN_SEGMENT_REPLACE;
+		seginfo->seg = NULL;
+		seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
+		memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
+		seginfo->nitems = nuncompressed;
+
+		dlist_push_tail(&leaf->segments, &seginfo->node);
+
+		leaf->oldformat = true;
+	}
+
+	return leaf;
+}
+
+/*
+ * Distribute newItems to the segments.
+ *
+ * Any segments that acquire new items are decoded, and the new items are
+ * merged with the old items.
+ *
+ * Returns true if any new items were added. False means they were all
+ * duplicates of existing items on the page.
+ */
+static bool
+addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
+{
+	dlist_iter	iter;
+	ItemPointer nextnew = newItems;
+	int			newleft = nNewItems;
+	bool		modified = false;
+	leafSegmentInfo *newseg;
+
+	/*
+	 * If the page is completely empty, just construct one new segment to hold
+	 * all the new items.
+	 */
+	if (dlist_is_empty(&leaf->segments))
+	{
+		newseg = palloc(sizeof(leafSegmentInfo));
+		newseg->seg = NULL;
+		newseg->items = newItems;
+		newseg->nitems = nNewItems;
+		newseg->action = GIN_SEGMENT_INSERT;
+		dlist_push_tail(&leaf->segments, &newseg->node);
+		return true;
+	}
+
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
+		int			nthis;
+		ItemPointer tmpitems;
+		int			ntmpitems;
+
+		/*
+		 * How many of the new items fall into this segment?
+		 */
+		if (!dlist_has_next(&leaf->segments, iter.cur))
+			nthis = newleft;
+		else
+		{
+			leafSegmentInfo *next;
+			ItemPointerData next_first;
+
+			next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
+								 dlist_next_node(&leaf->segments, iter.cur));
+			if (next->items)
+				next_first = next->items[0];
+			else
+			{
+				Assert(next->seg != NULL);
+				next_first = next->seg->first;
+			}
+
+			nthis = 0;
+			while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
+				nthis++;
+		}
+		if (nthis == 0)
+			continue;
+
+		/* Merge the new items with the existing items. */
+		if (!cur->items)
+			cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+
+		/*
+		 * Fast path for the important special case that we're appending to
+		 * the end of the page: don't let the last segment on the page grow
+		 * larger than the target, create a new segment before that happens.
+		 */
+		if (!dlist_has_next(&leaf->segments, iter.cur) &&
+			ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
+			cur->seg != NULL &&
+			SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+		{
+			newseg = palloc(sizeof(leafSegmentInfo));
+			newseg->seg = NULL;
+			newseg->items = nextnew;
+			newseg->nitems = nthis;
+			newseg->action = GIN_SEGMENT_INSERT;
+			dlist_push_tail(&leaf->segments, &newseg->node);
+			modified = true;
+			break;
+		}
+
+		tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
+										nextnew, nthis,
+										&ntmpitems);
+		if (ntmpitems != cur->nitems)
+		{
+			/*
+			 * If there are no duplicates, track the added items so that we
+			 * can emit a compact ADDITEMS WAL record later on. (it doesn't
+			 * seem worth re-checking which items were duplicates, if there
+			 * were any)
+			 */
+			if (ntmpitems == nthis + cur->nitems &&
+				cur->action == GIN_SEGMENT_UNMODIFIED)
+			{
+				cur->action = GIN_SEGMENT_ADDITEMS;
+				cur->modifieditems = nextnew;
+				cur->nmodifieditems = nthis;
+			}
+			else
+				cur->action = GIN_SEGMENT_REPLACE;
+
+			cur->items = tmpitems;
+			cur->nitems = ntmpitems;
+			cur->seg = NULL;
+			modified = true;
+		}
+
+		nextnew += nthis;
+		newleft -= nthis;
+		if (newleft == 0)
+			break;
+	}
+
+	return modified;
+}
+
+/*
+ * Recompresses all segments that have been modified.
+ *
+ * If not all the items fit on two pages (ie. after split), we store as
+ * many items as fit, and set *remaining to the first item that didn't fit.
+ * If all items fit, *remaining is set to invalid.
+ *
+ * Returns true if the page has to be split.
+ */
+static bool
+leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
+{
+	int			pgused = 0;
+	bool		needsplit = false;
+	dlist_iter	iter;
+	int			segsize;
+	leafSegmentInfo *nextseg;
+	int			npacked;
+	bool		modified;
+	dlist_node *cur_node;
+	dlist_node *next_node;
+
+	ItemPointerSetInvalid(remaining);
+
+	/*
+	 * cannot use dlist_foreach_modify here because we insert adjacent items
+	 * while iterating.
+	 */
+	for (cur_node = dlist_head_node(&leaf->segments);
+		 cur_node != NULL;
+		 cur_node = next_node)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+												   cur_node);
+
+		if (dlist_has_next(&leaf->segments, cur_node))
+			next_node = dlist_next_node(&leaf->segments, cur_node);
+		else
+			next_node = NULL;
+
+		/* Compress the posting list, if necessary */
+		if (seginfo->action != GIN_SEGMENT_DELETE)
+		{
+			if (seginfo->seg == NULL)
+			{
+				if (seginfo->nitems > GinPostingListSegmentMaxSize)
+					npacked = 0;	/* no chance that it would fit. */
+				else
+				{
+					seginfo->seg = ginCompressPostingList(seginfo->items,
+														  seginfo->nitems,
+												GinPostingListSegmentMaxSize,
+														  &npacked);
+				}
+				if (npacked != seginfo->nitems)
+				{
+					/*
+					 * Too large. Compress again to the target size, and
+					 * create a new segment to represent the remaining items.
+					 * The new segment is inserted after this one, so it will
+					 * be processed in the next iteration of this loop.
+					 */
+					if (seginfo->seg)
+						pfree(seginfo->seg);
+					seginfo->seg = ginCompressPostingList(seginfo->items,
+														  seginfo->nitems,
+											 GinPostingListSegmentTargetSize,
+														  &npacked);
+					if (seginfo->action != GIN_SEGMENT_INSERT)
+						seginfo->action = GIN_SEGMENT_REPLACE;
+
+					nextseg = palloc(sizeof(leafSegmentInfo));
+					nextseg->action = GIN_SEGMENT_INSERT;
+					nextseg->seg = NULL;
+					nextseg->items = &seginfo->items[npacked];
+					nextseg->nitems = seginfo->nitems - npacked;
+					next_node = &nextseg->node;
+					dlist_insert_after(cur_node, next_node);
+				}
+			}
+
+			/*
+			 * If the segment is very small, merge it with the next segment.
+			 */
+			if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+			{
+				int			nmerged;
+
+				nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+				if (seginfo->items == NULL)
+					seginfo->items = ginPostingListDecode(seginfo->seg,
+														  &seginfo->nitems);
+				if (nextseg->items == NULL)
+					nextseg->items = ginPostingListDecode(nextseg->seg,
+														  &nextseg->nitems);
+				nextseg->items =
+					ginMergeItemPointers(seginfo->items, seginfo->nitems,
+										 nextseg->items, nextseg->nitems,
+										 &nmerged);
+				Assert(nmerged == seginfo->nitems + nextseg->nitems);
+				nextseg->nitems = nmerged;
+				nextseg->seg = NULL;
+
+				nextseg->action = GIN_SEGMENT_REPLACE;
+				nextseg->modifieditems = NULL;
+				nextseg->nmodifieditems = 0;
+
+				if (seginfo->action == GIN_SEGMENT_INSERT)
+				{
+					dlist_delete(cur_node);
+					continue;
+				}
+				else
+				{
+					seginfo->action = GIN_SEGMENT_DELETE;
+					seginfo->seg = NULL;
+				}
+			}
+
+			seginfo->items = NULL;
+			seginfo->nitems = 0;
+		}
+
+		if (seginfo->action == GIN_SEGMENT_DELETE)
+			continue;
+
+		/*
+		 * OK, we now have a compressed version of this segment ready for
+		 * copying to the page. Did we exceed the size that fits on one page?
+		 */
+		segsize = SizeOfGinPostingList(seginfo->seg);
+		if (pgused + segsize > GinDataPageMaxDataSize)
+		{
+			if (!needsplit)
+			{
+				/* switch to right page */
+				Assert(pgused > 0);
+				leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
+				needsplit = true;
+				leaf->lsize = pgused;
+				pgused = 0;
+			}
+			else
+			{
+				/*
+				 * Filled both pages. The last segment we constructed did not
+				 * fit.
+				 */
+				*remaining = seginfo->seg->first;
+
+				/*
+				 * remove all segments that did not fit from the list.
+				 */
+				while (dlist_has_next(&leaf->segments, cur_node))
+					dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+				dlist_delete(cur_node);
+				break;
+			}
+		}
+
+		pgused += segsize;
+	}
+
+	if (!needsplit)
+	{
+		leaf->lsize = pgused;
+		leaf->rsize = 0;
+	}
+	else
+		leaf->rsize = pgused;
+
+	Assert(leaf->lsize <= GinDataPageMaxDataSize);
+	Assert(leaf->rsize <= GinDataPageMaxDataSize);
+
+	/*
+	 * Make a palloc'd copy of every segment after the first modified one,
+	 * because as we start copying items to the original page, we might
+	 * overwrite an existing segment.
+	 */
+	modified = false;
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+												   iter.cur);
+
+		if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+		{
+			modified = true;
+		}
+		else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+		{
+			GinPostingList *tmp;
+
+			segsize = SizeOfGinPostingList(seginfo->seg);
+			tmp = palloc(segsize);
+			memcpy(tmp, seginfo->seg, segsize);
+			seginfo->seg = tmp;
+		}
+	}
+
+	return needsplit;
+}
+
+
+/*** Functions that are exported to the rest of the GIN code ***/
+
+/*
+ * Creates new posting tree containing the given TIDs. Returns the page
+ * number of the root of the new posting tree.
+ *
+ * items[] must be in sorted order with no duplicates.
+ */
+BlockNumber
+createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
+				  GinStatsData *buildStats)
+{
+	BlockNumber blkno;
+	Buffer		buffer;
+	Page		tmppage;
+	Page		page;
+	Pointer		ptr;
+	int			nrootitems;
+	int			rootsize;
+
+	/* Construct the new root page in memory first. */
+	tmppage = (Page) palloc(BLCKSZ);
+	GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+	GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
+
+	/*
+	 * Write as many of the items to the root page as fit. In segments of max
+	 * GinPostingListSegmentMaxSize bytes each.
+	 */
+	nrootitems = 0;
+	rootsize = 0;
+	ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
+	while (nrootitems < nitems)
+	{
+		GinPostingList *segment;
+		int			npacked;
+		int			segsize;
+
+		segment = ginCompressPostingList(&items[nrootitems],
+										 nitems - nrootitems,
+										 GinPostingListSegmentMaxSize,
+										 &npacked);
+		segsize = SizeOfGinPostingList(segment);
+		if (rootsize + segsize > GinDataPageMaxDataSize)
+			break;
+
+		memcpy(ptr, segment, segsize);
+		ptr += segsize;
+		rootsize += segsize;
+		nrootitems += npacked;
+		pfree(segment);
+	}
+	GinDataPageSetDataSize(tmppage, rootsize);
+
+	/*
+	 * All set. Get a new physical page, and copy the in-memory page to it.
+	 */
+	buffer = GinNewBuffer(index);
+	page = BufferGetPage(buffer);
+	blkno = BufferGetBlockNumber(buffer);
+
+	START_CRIT_SECTION();
+
+	PageRestoreTempPage(tmppage, page);
+	MarkBufferDirty(buffer);
+
+	if (RelationNeedsWAL(index))
+	{
+		XLogRecPtr	recptr;
+		XLogRecData rdata[2];
+		ginxlogCreatePostingTree data;
+
+		data.node = index->rd_node;
+		data.blkno = blkno;
+		data.size = rootsize;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &data;
+		rdata[0].len = sizeof(ginxlogCreatePostingTree);
+		rdata[0].next = &rdata[1];
+
+		rdata[1].buffer = InvalidBuffer;
+		rdata[1].data = (char *) GinDataLeafPageGetPostingList(page);
+		rdata[1].len = rootsize;
+		rdata[1].next = NULL;
+
+		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata);
+		PageSetLSN(page, recptr);
+	}
+
+	UnlockReleaseBuffer(buffer);
+
+	END_CRIT_SECTION();
+
+	/* During index build, count the newly-added data page */
+	if (buildStats)
+		buildStats->nDataPages++;
+
+	elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
+
+	/*
+	 * Add any remaining TIDs to the newly-created posting tree.
+	 */
+	if (nitems > nrootitems)
+	{
+		ginInsertItemPointers(index, blkno,
+							  items + nrootitems,
+							  nitems - nrootitems,
+							  buildStats);
+	}
+
+	return blkno;
 }
 
 void
-ginPrepareDataScan(GinBtree btree, Relation index)
+ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
 {
 	memset(btree, 0, sizeof(GinBtreeData));
 
 	btree->index = index;
+	btree->rootBlkno = rootBlkno;
 
 	btree->findChildPage = dataLocateItem;
+	btree->getLeftMostChild = dataGetLeftMostPage;
 	btree->isMoveRight = dataIsMoveRight;
-	btree->findItem = dataLocateLeafItem;
+	btree->findItem = NULL;
 	btree->findChildPtr = dataFindChildPtr;
-	btree->getLeftMostPage = dataGetLeftMostPage;
-	btree->isEnoughSpace = dataIsEnoughSpace;
 	btree->placeToPage = dataPlaceToPage;
-	btree->splitPage = dataSplitPage;
 	btree->fillRoot = ginDataFillRoot;
+	btree->prepareDownlink = dataPrepareDownlink;
 
 	btree->isData = TRUE;
-	btree->searchMode = FALSE;
-	btree->isDelete = FALSE;
 	btree->fullScan = FALSE;
 	btree->isBuild = FALSE;
 }
 
-GinPostingTreeScan *
-ginPrepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
-{
-	GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan));
-
-	ginPrepareDataScan(&gdi->btree, index);
-
-	gdi->btree.searchMode = searchMode;
-	gdi->btree.fullScan = searchMode;
-
-	gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
-
-	return gdi;
-}
-
 /*
  * Inserts array of item pointers, may execute several tree scan (very rare)
  */
 void
-ginInsertItemPointers(GinPostingTreeScan *gdi,
+ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
 					  ItemPointerData *items, uint32 nitem,
 					  GinStatsData *buildStats)
 {
-	BlockNumber rootBlkno = gdi->stack->blkno;
+	GinBtreeData btree;
+	GinBtreeDataLeafInsertData insertdata;
+	GinBtreeStack *stack;
 
-	gdi->btree.items = items;
-	gdi->btree.nitem = nitem;
-	gdi->btree.curitem = 0;
+	ginPrepareDataScan(&btree, index, rootBlkno);
+	btree.isBuild = (buildStats != NULL);
+	insertdata.items = items;
+	insertdata.nitem = nitem;
+	insertdata.curitem = 0;
 
-	while (gdi->btree.curitem < gdi->btree.nitem)
+	while (insertdata.curitem < insertdata.nitem)
 	{
-		if (!gdi->stack)
-			gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
-
-		gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
-
-		if (gdi->btree.findItem(&(gdi->btree), gdi->stack))
-		{
-			/*
-			 * gdi->btree.items[gdi->btree.curitem] already exists in index
-			 */
-			gdi->btree.curitem++;
-			LockBuffer(gdi->stack->buffer, GIN_UNLOCK);
-			freeGinBtreeStack(gdi->stack);
-		}
-		else
-			ginInsertValue(&(gdi->btree), gdi->stack, buildStats);
+		/* search for the leaf page where the first item should go to */
+		btree.itemptr = insertdata.items[insertdata.curitem];
+		stack = ginFindLeafPage(&btree, false);
 
-		gdi->stack = NULL;
+		ginInsertValue(&btree, stack, &insertdata, buildStats);
 	}
 }
 
-Buffer
-ginScanBeginPostingTree(GinPostingTreeScan *gdi)
+/*
+ * Starts a new scan on a posting tree.
+ */
+GinBtreeStack *
+ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
 {
-	gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
-	return gdi->stack->buffer;
+	GinBtreeStack *stack;
+
+	ginPrepareDataScan(btree, index, rootBlkno);
+
+	btree->fullScan = TRUE;
+
+	stack = ginFindLeafPage(btree, TRUE);
+
+	return stack;
 }
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 70fcddfe40..412f90da4d 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -1,10 +1,10 @@
 /*-------------------------------------------------------------------------
  *
  * ginentrypage.c
- *	  page utilities routines for the postgres inverted index access method.
+ *	  routines for handling GIN entry tree pages.
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -15,8 +15,15 @@
 #include "postgres.h"
 
 #include "access/gin_private.h"
+#include "miscadmin.h"
 #include "utils/rel.h"
 
+static void entrySplitPage(GinBtree btree, Buffer origbuf,
+			   GinBtreeStack *stack,
+			   void *insertPayload,
+			   BlockNumber updateblkno, XLogRecData **prdata,
+			   Page *newlpage, Page *newrpage);
+
 /*
  * Form a tuple for entry tree.
  *
@@ -27,15 +34,15 @@
  * format that is being built here.  We build on the assumption that we
  * are making a leaf-level key entry containing a posting list of nipd items.
  * If the caller is actually trying to make a posting-tree entry, non-leaf
- * entry, or pending-list entry, it should pass nipd = 0 and then overwrite
- * the t_tid fields as necessary.  In any case, ipd can be NULL to skip
- * copying any itempointers into the posting list; the caller is responsible
- * for filling the posting list afterwards, if ipd = NULL and nipd > 0.
+ * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
+ * the t_tid fields as necessary.  In any case, 'data' can be NULL to skip
+ * filling in the posting list; the caller is responsible for filling it
+ * afterwards if data = NULL and nipd > 0.
  */
 IndexTuple
 GinFormTuple(GinState *ginstate,
 			 OffsetNumber attnum, Datum key, GinNullCategory category,
-			 ItemPointerData *ipd, uint32 nipd,
+			 Pointer data, Size dataSize, int nipd,
 			 bool errorTooBig)
 {
 	Datum		datums[2];
@@ -83,24 +90,23 @@ GinFormTuple(GinState *ginstate,
 	newsize = SHORTALIGN(newsize);
 
 	GinSetPostingOffset(itup, newsize);
-
 	GinSetNPosting(itup, nipd);
 
 	/*
 	 * Add space needed for posting list, if any.  Then check that the tuple
 	 * won't be too big to store.
 	 */
-	newsize += sizeof(ItemPointerData) * nipd;
+	newsize += dataSize;
+
 	newsize = MAXALIGN(newsize);
-	if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
+
+	if (newsize > GinMaxItemSize)
 	{
 		if (errorTooBig)
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-			errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
-				   (unsigned long) newsize,
-				   (unsigned long) Min(INDEX_SIZE_MASK,
-									   GinMaxItemSize),
+			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+				   (Size) newsize, (Size) GinMaxItemSize,
 				   RelationGetRelationName(ginstate->index))));
 		pfree(itup);
 		return NULL;
@@ -113,12 +119,28 @@ GinFormTuple(GinState *ginstate,
 	{
 		itup = repalloc(itup, newsize);
 
+		/*
+		 * PostgreSQL 9.3 and earlier did not clear this new space, so we
+		 * might find uninitialized padding when reading tuples from disk.
+		 */
+		memset((char *) itup + IndexTupleSize(itup),
+			   0, newsize - IndexTupleSize(itup));
 		/* set new size in tuple header */
 		itup->t_info &= ~INDEX_SIZE_MASK;
 		itup->t_info |= newsize;
 	}
 
 	/*
+	 * Copy in the posting list, if provided
+	 */
+	if (data)
+	{
+		char	   *ptr = GinGetPosting(itup);
+
+		memcpy(ptr, data, dataSize);
+	}
+
+	/*
 	 * Insert category byte, if needed
 	 */
 	if (category != GIN_CAT_NORM_KEY)
@@ -126,44 +148,52 @@ GinFormTuple(GinState *ginstate,
 		Assert(IndexTupleHasNulls(itup));
 		GinSetNullCategory(itup, ginstate, category);
 	}
-
-	/*
-	 * Copy in the posting list, if provided
-	 */
-	if (ipd)
-		memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
-
 	return itup;
 }
 
 /*
- * Sometimes we reduce the number of posting list items in a tuple after
- * having built it with GinFormTuple.  This function adjusts the size
- * fields to match.
+ * Read item pointers from leaf entry tuple.
+ *
+ * Returns a palloc'd array of ItemPointers. The number of items is returned
+ * in *nitems.
  */
-void
-GinShortenTuple(IndexTuple itup, uint32 nipd)
+ItemPointer
+ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
+			 int *nitems)
 {
-	uint32		newsize;
-
-	Assert(nipd <= GinGetNPosting(itup));
-
-	newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
-	newsize = MAXALIGN(newsize);
+	Pointer		ptr = GinGetPosting(itup);
+	int			nipd = GinGetNPosting(itup);
+	ItemPointer ipd;
+	int			ndecoded;
 
-	Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
-
-	itup->t_info &= ~INDEX_SIZE_MASK;
-	itup->t_info |= newsize;
-
-	GinSetNPosting(itup, nipd);
+	if (GinItupIsCompressed(itup))
+	{
+		if (nipd > 0)
+		{
+			ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
+			if (nipd != ndecoded)
+				elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
+					 nipd, ndecoded);
+		}
+		else
+		{
+			ipd = palloc(0);
+		}
+	}
+	else
+	{
+		ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
+		memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
+	}
+	*nitems = nipd;
+	return ipd;
 }
 
 /*
  * Form a non-leaf entry tuple by copying the key data from the given tuple,
  * which can be either a leaf or non-leaf entry tuple.
  *
- * Any posting list in the source tuple is not copied.	The specified child
+ * Any posting list in the source tuple is not copied.  The specified child
  * block number is inserted into t_tid.
  */
 static IndexTuple
@@ -252,7 +282,7 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
 	{
 		stack->off = FirstOffsetNumber;
 		stack->predictNumber *= PageGetMaxOffsetNumber(page);
-		return btree->getLeftMostPage(btree, page);
+		return btree->getLeftMostChild(btree, page);
 	}
 
 	low = FirstOffsetNumber;
@@ -425,22 +455,26 @@ entryGetLeftMostPage(GinBtree btree, Page page)
 }
 
 static bool
-entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
+entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
+				   GinBtreeEntryInsertData *insertData)
 {
-	Size		itupsz = 0;
+	Size		releasedsz = 0;
+	Size		addedsz;
 	Page		page = BufferGetPage(buf);
 
-	Assert(btree->entry);
+	Assert(insertData->entry);
 	Assert(!GinPageIsData(page));
 
-	if (btree->isDelete)
+	if (insertData->isDelete)
 	{
 		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
 
-		itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+		releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
 	}
 
-	if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
+	addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);
+
+	if (PageGetFreeSpace(page) + releasedsz >= addedsz)
 		return true;
 
 	return false;
@@ -451,136 +485,140 @@ entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
  * should update it, update old child blkno to new right page
  * if child split occurred
  */
-static BlockNumber
-entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
+static void
+entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
+				 GinBtreeEntryInsertData *insertData, BlockNumber updateblkno)
 {
-	BlockNumber ret = InvalidBlockNumber;
-
-	Assert(btree->entry);
+	Assert(insertData->entry);
 	Assert(!GinPageIsData(page));
 
-	if (btree->isDelete)
+	if (insertData->isDelete)
 	{
 		Assert(GinPageIsLeaf(page));
 		PageIndexTupleDelete(page, off);
 	}
 
-	if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
+	if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber)
 	{
 		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
 
-		GinSetDownlink(itup, btree->rightblkno);
-		ret = btree->rightblkno;
+		GinSetDownlink(itup, updateblkno);
 	}
-
-	btree->rightblkno = InvalidBlockNumber;
-
-	return ret;
 }
 
 /*
  * Place tuple on page and fills WAL record
+ *
+ * If the tuple doesn't fit, returns false without modifying the page.
+ *
+ * On insertion to an internal node, in addition to inserting the given item,
+ * the downlink of the existing item at 'off' is updated to point to
+ * 'updateblkno'.
  */
-static void
-entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
+static GinPlaceToPageRC
+entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+				 void *insertPayload, BlockNumber updateblkno,
+				 XLogRecData **prdata, Page *newlpage, Page *newrpage)
 {
+	GinBtreeEntryInsertData *insertData = insertPayload;
 	Page		page = BufferGetPage(buf);
+	OffsetNumber off = stack->off;
 	OffsetNumber placed;
 	int			cnt = 0;
 
 	/* these must be static so they can be returned to caller */
 	static XLogRecData rdata[3];
-	static ginxlogInsert data;
+	static ginxlogInsertEntry data;
+
+	/* quick exit if it doesn't fit */
+	if (!entryIsEnoughSpace(btree, buf, off, insertData))
+	{
+		entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
+					   prdata, newlpage, newrpage);
+		return SPLIT;
+	}
+
+	START_CRIT_SECTION();
 
 	*prdata = rdata;
-	data.updateBlkno = entryPreparePage(btree, page, off);
+	entryPreparePage(btree, page, off, insertData, updateblkno);
 
-	placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
+	placed = PageAddItem(page,
+						 (Item) insertData->entry,
+						 IndexTupleSize(insertData->entry),
+						 off, false, false);
 	if (placed != off)
 		elog(ERROR, "failed to add item to index page in \"%s\"",
 			 RelationGetRelationName(btree->index));
 
-	data.node = btree->index->rd_node;
-	data.blkno = BufferGetBlockNumber(buf);
+	data.isDelete = insertData->isDelete;
 	data.offset = off;
-	data.nitem = 1;
-	data.isDelete = btree->isDelete;
-	data.isData = false;
-	data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
-
-	/*
-	 * Prevent full page write if child's split occurs. That is needed to
-	 * remove incomplete splits while replaying WAL
-	 *
-	 * data.updateBlkno contains new block number (of newly created right
-	 * page) for recently splited page.
-	 */
-	if (data.updateBlkno == InvalidBlockNumber)
-	{
-		rdata[0].buffer = buf;
-		rdata[0].buffer_std = TRUE;
-		rdata[0].data = NULL;
-		rdata[0].len = 0;
-		rdata[0].next = &rdata[1];
-		cnt++;
-	}
 
-	rdata[cnt].buffer = InvalidBuffer;
+	rdata[cnt].buffer = buf;
+	rdata[cnt].buffer_std = true;
 	rdata[cnt].data = (char *) &data;
-	rdata[cnt].len = sizeof(ginxlogInsert);
+	rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple);
 	rdata[cnt].next = &rdata[cnt + 1];
 	cnt++;
 
-	rdata[cnt].buffer = InvalidBuffer;
-	rdata[cnt].data = (char *) btree->entry;
-	rdata[cnt].len = IndexTupleSize(btree->entry);
+	rdata[cnt].buffer = buf;
+	rdata[cnt].buffer_std = true;
+	rdata[cnt].data = (char *) insertData->entry;
+	rdata[cnt].len = IndexTupleSize(insertData->entry);
 	rdata[cnt].next = NULL;
 
-	btree->entry = NULL;
+	return INSERTED;
 }
 
 /*
  * Place tuple and split page, original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data.
+ * returns shadow pages filled with new data.
  * Tuples are distributed between pages by equal size on its, not
  * an equal number!
  */
-static Page
-entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
+static void
+entrySplitPage(GinBtree btree, Buffer origbuf,
+			   GinBtreeStack *stack,
+			   void *insertPayload,
+			   BlockNumber updateblkno, XLogRecData **prdata,
+			   Page *newlpage, Page *newrpage)
 {
+	GinBtreeEntryInsertData *insertData = insertPayload;
+	OffsetNumber off = stack->off;
 	OffsetNumber i,
 				maxoff,
 				separator = InvalidOffsetNumber;
 	Size		totalsize = 0;
+	Size		tupstoresize;
 	Size		lsize = 0,
 				size;
 	char	   *ptr;
-	IndexTuple	itup,
-				leftrightmost = NULL;
+	IndexTuple	itup;
 	Page		page;
-	Page		lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
-	Page		rpage = BufferGetPage(rbuf);
+	Page		lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
+	Page		rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
 	Size		pageSize = PageGetPageSize(lpage);
 
 	/* these must be static so they can be returned to caller */
 	static XLogRecData rdata[2];
-	static ginxlogSplit data;
+	static ginxlogSplitEntry data;
 	static char tupstore[2 * BLCKSZ];
 
 	*prdata = rdata;
-	data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
-		InvalidOffsetNumber : GinGetDownlink(btree->entry);
-	data.updateBlkno = entryPreparePage(btree, lpage, off);
+	entryPreparePage(btree, lpage, off, insertData, updateblkno);
 
+	/*
+	 * First, append all the existing tuples and the new tuple we're inserting
+	 * one after another in a temporary workspace.
+	 */
 	maxoff = PageGetMaxOffsetNumber(lpage);
 	ptr = tupstore;
-
 	for (i = FirstOffsetNumber; i <= maxoff; i++)
 	{
 		if (i == off)
 		{
-			size = MAXALIGN(IndexTupleSize(btree->entry));
-			memcpy(ptr, btree->entry, size);
+			size = MAXALIGN(IndexTupleSize(insertData->entry));
+			memcpy(ptr, insertData->entry, size);
 			ptr += size;
 			totalsize += size + sizeof(ItemIdData);
 		}
@@ -594,12 +632,17 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
 
 	if (off == maxoff + 1)
 	{
-		size = MAXALIGN(IndexTupleSize(btree->entry));
-		memcpy(ptr, btree->entry, size);
+		size = MAXALIGN(IndexTupleSize(insertData->entry));
+		memcpy(ptr, insertData->entry, size);
 		ptr += size;
 		totalsize += size + sizeof(ItemIdData);
 	}
+	tupstoresize = ptr - tupstore;
 
+	/*
+	 * Initialize the left and right pages, and copy all the tuples back to
+	 * them.
+	 */
 	GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
 	GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
 
@@ -620,7 +663,6 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
 		}
 		else
 		{
-			leftrightmost = itup;
 			lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
 		}
 
@@ -630,48 +672,41 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
 		ptr += MAXALIGN(IndexTupleSize(itup));
 	}
 
-	btree->entry = GinFormInteriorTuple(leftrightmost, lpage,
-										BufferGetBlockNumber(lbuf));
-
-	btree->rightblkno = BufferGetBlockNumber(rbuf);
-
-	data.node = btree->index->rd_node;
-	data.rootBlkno = InvalidBlockNumber;
-	data.lblkno = BufferGetBlockNumber(lbuf);
-	data.rblkno = BufferGetBlockNumber(rbuf);
 	data.separator = separator;
 	data.nitem = maxoff;
-	data.isData = FALSE;
-	data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
-	data.isRootSplit = FALSE;
 
 	rdata[0].buffer = InvalidBuffer;
 	rdata[0].data = (char *) &data;
-	rdata[0].len = sizeof(ginxlogSplit);
+	rdata[0].len = sizeof(ginxlogSplitEntry);
 	rdata[0].next = &rdata[1];
 
 	rdata[1].buffer = InvalidBuffer;
 	rdata[1].data = tupstore;
-	rdata[1].len = MAXALIGN(totalsize);
+	rdata[1].len = tupstoresize;
 	rdata[1].next = NULL;
 
-	return lpage;
+	*newlpage = lpage;
+	*newrpage = rpage;
 }
 
 /*
- * return newly allocated rightmost tuple
+ * Construct insertion payload for inserting the downlink for given buffer.
  */
-IndexTuple
-ginPageGetLinkItup(Buffer buf)
+static void *
+entryPrepareDownlink(GinBtree btree, Buffer lbuf)
 {
-	IndexTuple	itup,
-				nitup;
-	Page		page = BufferGetPage(buf);
+	GinBtreeEntryInsertData *insertData;
+	Page		lpage = BufferGetPage(lbuf);
+	BlockNumber lblkno = BufferGetBlockNumber(lbuf);
+	IndexTuple	itup;
 
-	itup = getRightMostTuple(page);
-	nitup = GinFormInteriorTuple(itup, page, BufferGetBlockNumber(buf));
+	itup = getRightMostTuple(lpage);
 
-	return nitup;
+	insertData = palloc(sizeof(GinBtreeEntryInsertData));
+	insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno);
+	insertData->isDelete = false;
+
+	return insertData;
 }
 
 /*
@@ -679,20 +714,19 @@ ginPageGetLinkItup(Buffer buf)
  * Also called from ginxlog, should not use btree
  */
 void
-ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginEntryFillRoot(GinBtree btree, Page root,
+				 BlockNumber lblkno, Page lpage,
+				 BlockNumber rblkno, Page rpage)
 {
-	Page		page;
 	IndexTuple	itup;
 
-	page = BufferGetPage(root);
-
-	itup = ginPageGetLinkItup(lbuf);
-	if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+	itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
+	if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
 		elog(ERROR, "failed to add item to index root page");
 	pfree(itup);
 
-	itup = ginPageGetLinkItup(rbuf);
-	if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+	itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
+	if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
 		elog(ERROR, "failed to add item to index root page");
 	pfree(itup);
 }
@@ -711,25 +745,23 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
 	memset(btree, 0, sizeof(GinBtreeData));
 
 	btree->index = ginstate->index;
+	btree->rootBlkno = GIN_ROOT_BLKNO;
 	btree->ginstate = ginstate;
 
 	btree->findChildPage = entryLocateEntry;
+	btree->getLeftMostChild = entryGetLeftMostPage;
 	btree->isMoveRight = entryIsMoveRight;
 	btree->findItem = entryLocateLeafEntry;
 	btree->findChildPtr = entryFindChildPtr;
-	btree->getLeftMostPage = entryGetLeftMostPage;
-	btree->isEnoughSpace = entryIsEnoughSpace;
 	btree->placeToPage = entryPlaceToPage;
-	btree->splitPage = entrySplitPage;
 	btree->fillRoot = ginEntryFillRoot;
+	btree->prepareDownlink = entryPrepareDownlink;
 
 	btree->isData = FALSE;
-	btree->searchMode = FALSE;
 	btree->fullScan = FALSE;
 	btree->isBuild = FALSE;
 
 	btree->entryAttnum = attnum;
 	btree->entryKey = key;
 	btree->entryCategory = category;
-	btree->isDelete = FALSE;
 }
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index b9bfde2ee4..09c3e39bf3 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -7,7 +7,7 @@
  *	  transfer pending entries into the regular index structure.  This
  *	  wins because bulk insertion is much more efficient than retail.
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -119,15 +119,13 @@ writeListPage(Relation index, Buffer buffer,
 		rdata[0].len = sizeof(ginxlogInsertListPage);
 		rdata[0].next = rdata + 1;
 
-		rdata[1].buffer = buffer;
-		rdata[1].buffer_std = true;
+		rdata[1].buffer = InvalidBuffer;
 		rdata[1].data = workspace;
 		rdata[1].len = size;
 		rdata[1].next = NULL;
 
 		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata);
 		PageSetLSN(page, recptr);
-		PageSetTLI(page, ThisTimeLineID);
 	}
 
 	/* get free space before releasing buffer */
@@ -290,7 +288,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
 		if (metadata->head == InvalidBlockNumber)
 		{
 			/*
-			 * Main list is empty, so just copy sublist into main list
+			 * Main list is empty, so just insert sublist as main list
 			 */
 			START_CRIT_SECTION();
 
@@ -313,6 +311,14 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
 			LockBuffer(buffer, GIN_EXCLUSIVE);
 			page = BufferGetPage(buffer);
 
+			rdata[0].next = rdata + 1;
+
+			rdata[1].buffer = buffer;
+			rdata[1].buffer_std = true;
+			rdata[1].data = NULL;
+			rdata[1].len = 0;
+			rdata[1].next = NULL;
+
 			Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);
 
 			START_CRIT_SECTION();
@@ -400,12 +406,10 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
 
 		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata);
 		PageSetLSN(metapage, recptr);
-		PageSetTLI(metapage, ThisTimeLineID);
 
 		if (buffer != InvalidBuffer)
 		{
 			PageSetLSN(page, recptr);
-			PageSetTLI(page, ThisTimeLineID);
 		}
 	}
 
@@ -436,7 +440,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
  * Create temporary index tuples for a single indexable item (one index column
  * for the heap tuple specified by ht_ctid), and append them to the array
  * in *collector.  They will subsequently be written out using
- * ginHeapTupleFastInsert.	Note that to guarantee consistent state, all
+ * ginHeapTupleFastInsert.  Note that to guarantee consistent state, all
  * temp tuples for a given heap tuple must be written in one call to
  * ginHeapTupleFastInsert.
  */
@@ -482,7 +486,7 @@ ginHeapTupleFastCollect(GinState *ginstate,
 		IndexTuple	itup;
 
 		itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
-							NULL, 0, true);
+							NULL, 0, 0, true);
 		itup->t_tid = *ht_ctid;
 		collector->tuples[collector->ntuples++] = itup;
 		collector->sumsize += IndexTupleSize(itup);
@@ -586,13 +590,11 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
 
 			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata);
 			PageSetLSN(metapage, recptr);
-			PageSetTLI(metapage, ThisTimeLineID);
 
 			for (i = 0; i < data.ndeleted; i++)
 			{
 				page = BufferGetPage(buffers[i]);
 				PageSetLSN(page, recptr);
-				PageSetTLI(page, ThisTimeLineID);
 			}
 		}
 
@@ -705,7 +707,7 @@ processPendingPage(BuildAccumulator *accum, KeyArray *ka,
  *
  * This can be called concurrently by multiple backends, so it must cope.
  * On first glance it looks completely not concurrent-safe and not crash-safe
- * either.	The reason it's okay is that multiple insertion of the same entry
+ * either.  The reason it's okay is that multiple insertion of the same entry
  * is detected and treated as a no-op by gininsert.c.  If we crash after
  * posting entries to the main index and before removing them from the
  * pending list, it's okay because when we redo the posting later on, nothing
@@ -759,7 +761,7 @@ ginInsertCleanup(GinState *ginstate,
 	LockBuffer(metabuffer, GIN_UNLOCK);
 
 	/*
-	 * Initialize.	All temporary space will be in opCtx
+	 * Initialize.  All temporary space will be in opCtx
 	 */
 	opCtx = AllocSetContextCreate(CurrentMemoryContext,
 								  "GIN insert cleanup temporary context",
@@ -853,7 +855,7 @@ ginInsertCleanup(GinState *ginstate,
 
 			/*
 			 * While we left the page unlocked, more stuff might have gotten
-			 * added to it.  If so, process those entries immediately.	There
+			 * added to it.  If so, process those entries immediately.  There
 			 * shouldn't be very many, so we don't worry about the fact that
 			 * we're doing this with exclusive lock. Insertion algorithm
 			 * guarantees that inserted row(s) will not continue on next page.
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 022bd27b23..144ed504dc 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -4,7 +4,7 @@
  *	  fetch tuples from a GIN scan.
  *
  *
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
@@ -20,6 +20,8 @@
 #include "utils/datum.h"
 #include "utils/memutils.h"
 
+/* GUC parameter */
+int			GinFuzzySearchLimit = 0;
 
 typedef struct pendingPosition
 {
@@ -32,67 +34,6 @@ typedef struct pendingPosition
 
 
 /*
- * Convenience function for invoking a key's consistentFn
- */
-static bool
-callConsistentFn(GinState *ginstate, GinScanKey key)
-{
-	/*
-	 * If we're dealing with a dummy EVERYTHING key, we don't want to call the
-	 * consistentFn; just claim it matches.
-	 */
-	if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING)
-	{
-		key->recheckCurItem = false;
-		return true;
-	}
-
-	/*
-	 * Initialize recheckCurItem in case the consistentFn doesn't know it
-	 * should set it.  The safe assumption in that case is to force recheck.
-	 */
-	key->recheckCurItem = true;
-
-	return DatumGetBool(FunctionCall8Coll(&ginstate->consistentFn[key->attnum - 1],
-								 ginstate->supportCollation[key->attnum - 1],
-										  PointerGetDatum(key->entryRes),
-										  UInt16GetDatum(key->strategy),
-										  key->query,
-										  UInt32GetDatum(key->nuserentries),
-										  PointerGetDatum(key->extra_data),
-									   PointerGetDatum(&key->recheckCurItem),
-										  PointerGetDatum(key->queryValues),
-									 PointerGetDatum(key->queryCategories)));
-}
-
-/*
- * Tries to refind previously taken ItemPointer on a posting page.
- */
-static bool
-findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off)
-{
-	OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
-	int			res;
-
-	if (GinPageGetOpaque(page)->flags & GIN_DELETED)
-		/* page was deleted by concurrent vacuum */
-		return false;
-
-	/*
-	 * scan page to find equal or first greater value
-	 */
-	for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
-	{
-		res = ginCompareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
-
-		if (res <= 0)
-			return true;
-	}
-
-	return false;
-}
-
-/*
  * Goes to the next page if current offset is outside of bounds
  */
 static bool
@@ -105,16 +46,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
 		/*
 		 * We scanned the whole page, so we should take right page
 		 */
-		stack->blkno = GinPageGetOpaque(page)->rightlink;
-
 		if (GinPageRightMost(page))
 			return false;		/* no more pages */
 
-		LockBuffer(stack->buffer, GIN_UNLOCK);
-		stack->buffer = ReleaseAndReadBuffer(stack->buffer,
-											 btree->index,
-											 stack->blkno);
-		LockBuffer(stack->buffer, GIN_SHARE);
+		stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+		stack->blkno = BufferGetBlockNumber(stack->buffer);
 		stack->off = FirstOffsetNumber;
 	}
 
@@ -129,19 +65,17 @@ static void
 scanPostingTree(Relation index, GinScanEntry scanEntry,
 				BlockNumber rootPostingTree)
 {
-	GinPostingTreeScan *gdi;
+	GinBtreeData btree;
+	GinBtreeStack *stack;
 	Buffer		buffer;
 	Page		page;
-	BlockNumber blkno;
 
 	/* Descend to the leftmost leaf page */
-	gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
-
-	buffer = ginScanBeginPostingTree(gdi);
+	stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
+	buffer = stack->buffer;
 	IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
 
-	freeGinBtreeStack(gdi->stack);
-	pfree(gdi);
+	freeGinBtreeStack(stack);
 
 	/*
 	 * Loop iterates through all leaf pages of posting tree
@@ -149,23 +83,17 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
 	for (;;)
 	{
 		page = BufferGetPage(buffer);
-
-		if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 &&
-			GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
+		if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
 		{
-			tbm_add_tuples(scanEntry->matchBitmap,
-				   (ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber),
-						   GinPageGetOpaque(page)->maxoff, false);
-			scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff;
+			int			n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
+
+			scanEntry->predictNumberResult += n;
 		}
 
 		if (GinPageRightMost(page))
 			break;				/* no more pages */
 
-		blkno = GinPageGetOpaque(page)->rightlink;
-		LockBuffer(buffer, GIN_UNLOCK);
-		buffer = ReleaseAndReadBuffer(buffer, index, blkno);
-		LockBuffer(buffer, GIN_SHARE);
+		buffer = ginStepRight(buffer, index, GIN_SHARE);
 	}
 
 	UnlockReleaseBuffer(buffer);
@@ -173,7 +101,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
 
 /*
  * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
- * match the search entry.	This supports three different match modes:
+ * match the search entry.  This supports three different match modes:
  *
  * 1. Partial-match support: scan from current point until the
  *	  comparePartialFn says we're done.
@@ -269,7 +197,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 			/*
 			 * In ALL mode, we are not interested in null items, so we can
 			 * stop if we get to a null-item placeholder (which will be the
-			 * last entry for a given attnum).	We do want to include NULL_KEY
+			 * last entry for a given attnum).  We do want to include NULL_KEY
 			 * and EMPTY_ITEM entries, though.
 			 */
 			if (icategory == GIN_CAT_NULL_ITEM)
@@ -344,8 +272,11 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 		}
 		else
 		{
-			tbm_add_tuples(scanEntry->matchBitmap,
-						   GinGetPosting(itup), GinGetNPosting(itup), false);
+			ItemPointer ipd;
+			int			nipd;
+
+			ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+			tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
 			scanEntry->predictNumberResult += GinGetNPosting(itup);
 		}
 
@@ -354,8 +285,6 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 		 */
 		stack->off++;
 	}
-
-	return true;
 }
 
 /*
@@ -387,8 +316,7 @@ restartScanEntry:
 	ginPrepareEntryScan(&btreeEntry, entry->attnum,
 						entry->queryKey, entry->queryCategory,
 						ginstate);
-	btreeEntry.searchMode = TRUE;
-	stackEntry = ginFindLeafPage(&btreeEntry, NULL);
+	stackEntry = ginFindLeafPage(&btreeEntry, true);
 	page = BufferGetPage(stackEntry->buffer);
 	needUnlock = TRUE;
 
@@ -438,8 +366,9 @@ restartScanEntry:
 		if (GinIsPostingTree(itup))
 		{
 			BlockNumber rootPostingTree = GinGetPostingTree(itup);
-			GinPostingTreeScan *gdi;
+			GinBtreeStack *stack;
 			Page		page;
+			ItemPointerData minItem;
 
 			/*
 			 * We should unlock entry page before touching posting tree to
@@ -450,9 +379,10 @@ restartScanEntry:
 			 */
 			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
 			needUnlock = FALSE;
-			gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, TRUE);
 
-			entry->buffer = ginScanBeginPostingTree(gdi);
+			stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
+											rootPostingTree);
+			entry->buffer = stack->buffer;
 
 			/*
 			 * We keep buffer pinned because we need to prevent deletion of
@@ -462,26 +392,25 @@ restartScanEntry:
 			IncrBufferRefCount(entry->buffer);
 
 			page = BufferGetPage(entry->buffer);
-			entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff;
 
 			/*
-			 * Keep page content in memory to prevent durable page locking
+			 * Load the first page into memory.
 			 */
-			entry->list = (ItemPointerData *) palloc(BLCKSZ);
-			entry->nlist = GinPageGetOpaque(page)->maxoff;
-			memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber),
-				   GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+			ItemPointerSetMin(&minItem);
+			entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
+
+			entry->predictNumberResult = stack->predictNumber * entry->nlist;
 
 			LockBuffer(entry->buffer, GIN_UNLOCK);
-			freeGinBtreeStack(gdi->stack);
-			pfree(gdi);
+			freeGinBtreeStack(stack);
 			entry->isFinished = FALSE;
 		}
 		else if (GinGetNPosting(itup) > 0)
 		{
-			entry->nlist = GinGetNPosting(itup);
-			entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
-			memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist);
+			entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+									   &entry->nlist);
+			entry->predictNumberResult = entry->nlist;
+
 			entry->isFinished = FALSE;
 		}
 	}
@@ -491,13 +420,106 @@ restartScanEntry:
 	freeGinBtreeStack(stackEntry);
 }
 
+/*
+ * Comparison function for scan entry indexes. Sorts by predictNumberResult,
+ * least frequent items first.
+ */
+static int
+entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
+{
+	const GinScanKey key = (const GinScanKey) arg;
+	int			i1 = *(const int *) a1;
+	int			i2 = *(const int *) a2;
+	uint32		n1 = key->scanEntry[i1]->predictNumberResult;
+	uint32		n2 = key->scanEntry[i2]->predictNumberResult;
+
+	if (n1 < n2)
+		return -1;
+	else if (n1 == n2)
+		return 0;
+	else
+		return 1;
+}
+
 static void
-startScanKey(GinState *ginstate, GinScanKey key)
+startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
 {
+	MemoryContext oldCtx = CurrentMemoryContext;
+	int			i;
+	int			j;
+	int		   *entryIndexes;
+
 	ItemPointerSetMin(&key->curItem);
 	key->curItemMatches = false;
 	key->recheckCurItem = false;
 	key->isFinished = false;
+
+	/*
+	 * Divide the entries into two distinct sets: required and additional.
+	 * Additional entries are not enough for a match alone, without any items
+	 * from the required set, but are needed by the consistent function to
+	 * decide if an item matches. When scanning, we can skip over items from
+	 * additional entries that have no corresponding matches in any of the
+	 * required entries. That speeds up queries like "frequent & rare"
+	 * considerably, if the frequent term can be put in the additional set.
+	 *
+	 * There can be many legal ways to divide them entries into these two
+	 * sets. A conservative division is to just put everything in the required
+	 * set, but the more you can put in the additional set, the more you can
+	 * skip during the scan. To maximize skipping, we try to put as many
+	 * frequent items as possible into additional, and less frequent ones into
+	 * required. To do that, sort the entries by frequency
+	 * (predictNumberResult), and put entries into the required set in that
+	 * order, until the consistent function says that none of the remaining
+	 * entries can form a match, without any items from the required set. The
+	 * rest go to the additional set.
+	 */
+	if (key->nentries > 1)
+	{
+		MemoryContextSwitchTo(so->tempCtx);
+
+		entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
+		for (i = 0; i < key->nentries; i++)
+			entryIndexes[i] = i;
+		qsort_arg(entryIndexes, key->nentries, sizeof(int),
+				  entryIndexByFrequencyCmp, key);
+
+		for (i = 0; i < key->nentries - 1; i++)
+		{
+			/* Pass all entries <= i as FALSE, and the rest as MAYBE */
+			for (j = 0; j <= i; j++)
+				key->entryRes[entryIndexes[j]] = GIN_FALSE;
+			for (j = i + 1; j < key->nentries; j++)
+				key->entryRes[entryIndexes[j]] = GIN_MAYBE;
+
+			if (key->triConsistentFn(key) == GIN_FALSE)
+				break;
+		}
+		/* i is now the last required entry. */
+
+		MemoryContextSwitchTo(oldCtx);
+
+		key->nrequired = i + 1;
+		key->nadditional = key->nentries - key->nrequired;
+		key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
+		key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+
+		j = 0;
+		for (i = 0; i < key->nrequired; i++)
+			key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
+		for (i = 0; i < key->nadditional; i++)
+			key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
+
+		/* clean up after consistentFn calls (also frees entryIndexes) */
+		MemoryContextReset(so->tempCtx);
+	}
+	else
+	{
+		key->nrequired = 1;
+		key->nadditional = 0;
+		key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
+		key->requiredEntries[0] = key->scanEntry[0];
+	}
 }
 
 static void
@@ -532,84 +554,146 @@ startScan(IndexScanDesc scan)
 	}
 
 	for (i = 0; i < so->nkeys; i++)
-		startScanKey(ginstate, so->keys + i);
+		startScanKey(ginstate, so, so->keys + i);
 }
 
 /*
- * Gets next ItemPointer from PostingTree. Note, that we copy
- * page into GinScanEntry->list array and unlock page, but keep it pinned
- * to prevent interference with vacuum
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
  */
 static void
-entryGetNextItem(GinState *ginstate, GinScanEntry entry)
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
 {
 	Page		page;
-	BlockNumber blkno;
+	int			i;
+	bool		stepright;
+
+	if (!BufferIsValid(entry->buffer))
+	{
+		entry->isFinished = true;
+		return;
+	}
 
+	/*
+	 * We have two strategies for finding the correct page: step right from
+	 * the current page, or descend the tree again from the root. If
+	 * advancePast equals the current item, the next matching item should be
+	 * on the next page, so we step right. Otherwise, descend from root.
+	 */
+	if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+	{
+		stepright = true;
+		LockBuffer(entry->buffer, GIN_SHARE);
+	}
+	else
+	{
+		GinBtreeStack *stack;
+
+		ReleaseBuffer(entry->buffer);
+
+		/*
+		 * Set the search key, and find the correct leaf page.
+		 */
+		if (ItemPointerIsLossyPage(&advancePast))
+		{
+			ItemPointerSet(&entry->btree.itemptr,
+						   GinItemPointerGetBlockNumber(&advancePast) + 1,
+						   FirstOffsetNumber);
+		}
+		else
+		{
+			entry->btree.itemptr = advancePast;
+			entry->btree.itemptr.ip_posid++;
+		}
+		entry->btree.fullScan = false;
+		stack = ginFindLeafPage(&entry->btree, true);
+
+		/* we don't need the stack, just the buffer. */
+		entry->buffer = stack->buffer;
+		IncrBufferRefCount(entry->buffer);
+		freeGinBtreeStack(stack);
+		stepright = false;
+	}
+
+	elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
+		 GinItemPointerGetBlockNumber(&advancePast),
+		 GinItemPointerGetOffsetNumber(&advancePast),
+		 !stepright);
+
+	page = BufferGetPage(entry->buffer);
 	for (;;)
 	{
-		if (entry->offset < entry->nlist)
+		entry->offset = InvalidOffsetNumber;
+		if (entry->list)
 		{
-			entry->curItem = entry->list[entry->offset++];
-			return;
+			pfree(entry->list);
+			entry->list = NULL;
+			entry->nlist = 0;
 		}
 
-		LockBuffer(entry->buffer, GIN_SHARE);
-		page = BufferGetPage(entry->buffer);
-		for (;;)
+		if (stepright)
 		{
 			/*
-			 * It's needed to go by right link. During that we should refind
-			 * first ItemPointer greater that stored
+			 * We've processed all the entries on this page. If it was the
+			 * last page in the tree, we're done.
 			 */
-
-			blkno = GinPageGetOpaque(page)->rightlink;
-
-			LockBuffer(entry->buffer, GIN_UNLOCK);
-			if (blkno == InvalidBlockNumber)
+			if (GinPageRightMost(page))
 			{
-				ReleaseBuffer(entry->buffer);
-				ItemPointerSetInvalid(&entry->curItem);
+				UnlockReleaseBuffer(entry->buffer);
 				entry->buffer = InvalidBuffer;
 				entry->isFinished = TRUE;
 				return;
 			}
 
-			entry->buffer = ReleaseAndReadBuffer(entry->buffer,
-												 ginstate->index,
-												 blkno);
-			LockBuffer(entry->buffer, GIN_SHARE);
+			/*
+			 * Step to next page, following the right link. then find the
+			 * first ItemPointer greater than advancePast.
+			 */
+			entry->buffer = ginStepRight(entry->buffer,
+										 ginstate->index,
+										 GIN_SHARE);
 			page = BufferGetPage(entry->buffer);
+		}
+		stepright = true;
 
-			entry->offset = InvalidOffsetNumber;
-			if (!ItemPointerIsValid(&entry->curItem) ||
-				findItemInPostingPage(page, &entry->curItem, &entry->offset))
-			{
-				/*
-				 * Found position equal to or greater than stored
-				 */
-				entry->nlist = GinPageGetOpaque(page)->maxoff;
-				memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber),
-				   GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+		if (GinPageGetOpaque(page)->flags & GIN_DELETED)
+			continue;			/* page was deleted by concurrent vacuum */
 
-				LockBuffer(entry->buffer, GIN_UNLOCK);
+		/*
+		 * The first item > advancePast might not be on this page, but
+		 * somewhere to the right, if the page was split, or a non-match from
+		 * another key in the query allowed us to skip some items from this
+		 * entry. Keep following the right-links until we re-find the correct
+		 * page.
+		 */
+		if (!GinPageRightMost(page) &&
+			ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
+		{
+			/*
+			 * the item we're looking is > the right bound of the page, so it
+			 * can't be on this page.
+			 */
+			continue;
+		}
 
-				if (!ItemPointerIsValid(&entry->curItem) ||
-					ginCompareItemPointers(&entry->curItem,
-									   entry->list + entry->offset - 1) == 0)
-				{
-					/*
-					 * First pages are deleted or empty, or we found exact
-					 * position, so break inner loop and continue outer one.
-					 */
-					break;
-				}
+		entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
 
-				/*
-				 * Find greater than entry->curItem position, store it.
-				 */
-				entry->curItem = entry->list[entry->offset - 1];
+		for (i = 0; i < entry->nlist; i++)
+		{
+			if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
+			{
+				entry->offset = i;
 
+				if (GinPageRightMost(page))
+				{
+					/* after processing the copied items, we're done. */
+					UnlockReleaseBuffer(entry->buffer);
+					entry->buffer = InvalidBuffer;
+				}
+				else
+					LockBuffer(entry->buffer, GIN_UNLOCK);
 				return;
 			}
 		}
@@ -620,10 +704,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
 
 /*
- * Sets entry->curItem to next heap item pointer for one entry of one scan key,
- * or sets entry->isFinished to TRUE if there are no more.
+ * Sets entry->curItem to next heap item pointer > advancePast, for one entry
+ * of one scan key, or sets entry->isFinished to TRUE if there are no more.
  *
- * Item pointers must be returned in ascending order.
+ * Item pointers are returned in ascending order.
  *
  * Note: this can return a "lossy page" item pointer, indicating that the
  * entry potentially matches all items on that heap page.  However, it is
@@ -633,16 +717,33 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
  * current implementation this is guaranteed by the behavior of tidbitmaps.
  */
 static void
-entryGetItem(GinState *ginstate, GinScanEntry entry)
+entryGetItem(GinState *ginstate, GinScanEntry entry,
+			 ItemPointerData advancePast)
 {
 	Assert(!entry->isFinished);
 
+	Assert(!ItemPointerIsValid(&entry->curItem) ||
+		   ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+
 	if (entry->matchBitmap)
 	{
+		/* A bitmap result */
+		BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
+		OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
+		bool		gotitem = false;
+
 		do
 		{
-			if (entry->matchResult == NULL ||
-				entry->offset >= entry->matchResult->ntuples)
+			/*
+			 * If we've exhausted all items on this block, move to next block
+			 * in the bitmap.
+			 */
+			while (entry->matchResult == NULL ||
+				   (entry->matchResult->ntuples >= 0 &&
+					entry->offset >= entry->matchResult->ntuples) ||
+				   entry->matchResult->blockno < advancePastBlk ||
+				   (ItemPointerIsLossyPage(&advancePast) &&
+					entry->matchResult->blockno == advancePastBlk))
 			{
 				entry->matchResult = tbm_iterate(entry->matchIterator);
 
@@ -663,54 +764,103 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
 				 */
 				entry->offset = 0;
 			}
+			if (entry->isFinished)
+				break;
 
+			/*
+			 * We're now on the first page after advancePast which has any
+			 * items on it. If it's a lossy result, return that.
+			 */
 			if (entry->matchResult->ntuples < 0)
 			{
-				/*
-				 * lossy result, so we need to check the whole page
-				 */
 				ItemPointerSetLossyPage(&entry->curItem,
 										entry->matchResult->blockno);
 
 				/*
 				 * We might as well fall out of the loop; we could not
 				 * estimate number of results on this page to support correct
-				 * reducing of result even if it's enabled
+				 * reducing of result even if it's enabled.
 				 */
+				gotitem = true;
 				break;
 			}
 
+			/*
+			 * Not a lossy page. Skip over any offsets <= advancePast, and
+			 * return that.
+			 */
+			if (entry->matchResult->blockno == advancePastBlk)
+			{
+				/*
+				 * First, do a quick check against the last offset on the
+				 * page. If that's > advancePast, so are all the other
+				 * offsets.
+				 */
+				if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
+				{
+					entry->offset = entry->matchResult->ntuples;
+					continue;
+				}
+
+				/* Otherwise scan to find the first item > advancePast */
+				while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+					entry->offset++;
+			}
+
 			ItemPointerSet(&entry->curItem,
 						   entry->matchResult->blockno,
 						   entry->matchResult->offsets[entry->offset]);
 			entry->offset++;
-		} while (entry->reduceResult == TRUE && dropItem(entry));
+			gotitem = true;
+		} while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry)));
 	}
 	else if (!BufferIsValid(entry->buffer))
 	{
-		entry->offset++;
-		if (entry->offset <= entry->nlist)
-			entry->curItem = entry->list[entry->offset - 1];
-		else
+		/*
+		 * A posting list from an entry tuple, or the last page of a posting
+		 * tree.
+		 */
+		do
 		{
-			ItemPointerSetInvalid(&entry->curItem);
-			entry->isFinished = TRUE;
-		}
+			if (entry->offset >= entry->nlist)
+			{
+				ItemPointerSetInvalid(&entry->curItem);
+				entry->isFinished = TRUE;
+				break;
+			}
+
+			entry->curItem = entry->list[entry->offset++];
+		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+		/* XXX: shouldn't we apply the fuzzy search limit here? */
 	}
 	else
 	{
+		/* A posting tree */
 		do
 		{
-			entryGetNextItem(ginstate, entry);
-		} while (entry->isFinished == FALSE &&
-				 entry->reduceResult == TRUE &&
-				 dropItem(entry));
+			/* If we've processed the current batch, load more items */
+			while (entry->offset >= entry->nlist)
+			{
+				entryLoadMoreItems(ginstate, entry, advancePast);
+
+				if (entry->isFinished)
+				{
+					ItemPointerSetInvalid(&entry->curItem);
+					return;
+				}
+			}
+
+			entry->curItem = entry->list[entry->offset++];
+
+		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
+				 (entry->reduceResult == TRUE && dropItem(entry)));
 	}
 }
 
 /*
- * Identify the "current" item among the input entry streams for this scan key,
- * and test whether it passes the scan key qual condition.
+ * Identify the "current" item among the input entry streams for this scan key
+ * that is greater than advancePast, and test whether it passes the scan key
+ * qual condition.
  *
  * The current item is the smallest curItem among the inputs.  key->curItem
  * is set to that value.  key->curItemMatches is set to indicate whether that
@@ -729,21 +879,30 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
  * logic in scanGetItem.)
  */
 static void
-keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
+		   ItemPointerData advancePast)
 {
 	ItemPointerData minItem;
 	ItemPointerData curPageLossy;
 	uint32		i;
-	uint32		lossyEntry;
 	bool		haveLossyEntry;
 	GinScanEntry entry;
-	bool		res;
+	GinTernaryValue res;
 	MemoryContext oldCtx;
+	bool		allFinished;
 
 	Assert(!key->isFinished);
 
 	/*
-	 * Find the minimum of the active entry curItems.
+	 * We might have already tested this item; if so, no need to repeat work.
+	 * (Note: the ">" case can happen, if advancePast is exact but we
+	 * previously had to set curItem to a lossy-page pointer.)
+	 */
+	if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
+		return;
+
+	/*
+	 * Find the minimum item > advancePast among the active entry streams.
 	 *
 	 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
 	 * offset (0xffff), so that it will sort after any exact entries for the
@@ -751,16 +910,34 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	 * pointers, which is good.
 	 */
 	ItemPointerSetMax(&minItem);
-
-	for (i = 0; i < key->nentries; i++)
+	allFinished = true;
+	for (i = 0; i < key->nrequired; i++)
 	{
-		entry = key->scanEntry[i];
-		if (entry->isFinished == FALSE &&
-			ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+		entry = key->requiredEntries[i];
+
+		if (entry->isFinished)
+			continue;
+
+		/*
+		 * Advance this stream if necessary.
+		 *
+		 * In particular, since entry->curItem was initialized with
+		 * ItemPointerSetMin, this ensures we fetch the first item for each
+		 * entry on the first call.
+		 */
+		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+		{
+			entryGetItem(ginstate, entry, advancePast);
+			if (entry->isFinished)
+				continue;
+		}
+
+		allFinished = false;
+		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
 			minItem = entry->curItem;
 	}
 
-	if (ItemPointerIsMax(&minItem))
+	if (allFinished)
 	{
 		/* all entries are finished */
 		key->isFinished = TRUE;
@@ -768,46 +945,96 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	}
 
 	/*
-	 * We might have already tested this item; if so, no need to repeat work.
-	 * (Note: the ">" case can happen, if minItem is exact but we previously
-	 * had to set curItem to a lossy-page pointer.)
+	 * Ok, we now know that there are no matches < minItem.
+	 *
+	 * If minItem is lossy, it means that there were no exact items on the
+	 * page among requiredEntries, because lossy pointers sort after exact
+	 * items. However, there might be exact items for the same page among
+	 * additionalEntries, so we mustn't advance past them.
 	 */
-	if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
-		return;
+	if (ItemPointerIsLossyPage(&minItem))
+	{
+		if (GinItemPointerGetBlockNumber(&advancePast) <
+			GinItemPointerGetBlockNumber(&minItem))
+		{
+			advancePast.ip_blkid = minItem.ip_blkid;
+			advancePast.ip_posid = 0;
+		}
+	}
+	else
+	{
+		Assert(minItem.ip_posid > 0);
+		advancePast = minItem;
+		advancePast.ip_posid--;
+	}
 
 	/*
-	 * OK, advance key->curItem and perform consistentFn test.
+	 * We might not have loaded all the entry streams for this TID yet. We
+	 * could call the consistent function, passing MAYBE for those entries, to
+	 * see if it can decide if this TID matches based on the information we
+	 * have. But if the consistent-function is expensive, and cannot in fact
+	 * decide with partial information, that could be a big loss. So, load all
+	 * the additional entries, before calling the consistent function.
 	 */
-	key->curItem = minItem;
+	for (i = 0; i < key->nadditional; i++)
+	{
+		entry = key->additionalEntries[i];
+
+		if (entry->isFinished)
+			continue;
+
+		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+		{
+			entryGetItem(ginstate, entry, advancePast);
+			if (entry->isFinished)
+				continue;
+		}
+
+		/*
+		 * Normally, none of the items in additionalEntries can have a curItem
+		 * larger than minItem. But if minItem is a lossy page, then there
+		 * might be exact items on the same page among additionalEntries.
+		 */
+		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+		{
+			Assert(ItemPointerIsLossyPage(&minItem));
+			minItem = entry->curItem;
+		}
+	}
 
 	/*
+	 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
+	 * and perform consistentFn test.
+	 *
 	 * Lossy-page entries pose a problem, since we don't know the correct
 	 * entryRes state to pass to the consistentFn, and we also don't know what
 	 * its combining logic will be (could be AND, OR, or even NOT). If the
 	 * logic is OR then the consistentFn might succeed for all items in the
 	 * lossy page even when none of the other entries match.
 	 *
-	 * If we have a single lossy-page entry then we check to see if the
-	 * consistentFn will succeed with only that entry TRUE.  If so, we return
-	 * a lossy-page pointer to indicate that the whole heap page must be
+	 * Our strategy is to call the tri-state consistent function, with the
+	 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+	 * returns FALSE, none of the lossy items alone are enough for a match, so
+	 * we don't need to return a lossy-page pointer. Otherwise, return a
+	 * lossy-page pointer to indicate that the whole heap page must be
 	 * checked.  (On subsequent calls, we'll do nothing until minItem is past
 	 * the page altogether, thus ensuring that we never return both regular
 	 * and lossy pointers for the same page.)
 	 *
-	 * This idea could be generalized to more than one lossy-page entry, but
-	 * ideally lossy-page entries should be infrequent so it would seldom be
-	 * the case that we have more than one at once.  So it doesn't seem worth
-	 * the extra complexity to optimize that case. If we do find more than
-	 * one, we just punt and return a lossy-page pointer always.
+	 * An exception is that it doesn't matter what we pass for lossy pointers
+	 * in "hidden" entries, because the consistentFn's result can't depend on
+	 * them. We could pass them as MAYBE as well, but if we're using the
+	 * "shim" implementation of a tri-state consistent function (see
+	 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
+	 * them as TRUE.
 	 *
 	 * Note that only lossy-page entries pointing to the current item's page
 	 * should trigger this processing; we might have future lossy pages in the
 	 * entry array, but they aren't relevant yet.
 	 */
+	key->curItem = minItem;
 	ItemPointerSetLossyPage(&curPageLossy,
 							GinItemPointerGetBlockNumber(&key->curItem));
-
-	lossyEntry = 0;
 	haveLossyEntry = false;
 	for (i = 0; i < key->nentries; i++)
 	{
@@ -815,17 +1042,14 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 		if (entry->isFinished == FALSE &&
 			ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
 		{
-			if (haveLossyEntry)
-			{
-				/* Multiple lossy entries, punt */
-				key->curItem = curPageLossy;
-				key->curItemMatches = true;
-				key->recheckCurItem = true;
-				return;
-			}
-			lossyEntry = i;
+			if (i < key->nuserentries)
+				key->entryRes[i] = GIN_MAYBE;
+			else
+				key->entryRes[i] = GIN_TRUE;
 			haveLossyEntry = true;
 		}
+		else
+			key->entryRes[i] = GIN_FALSE;
 	}
 
 	/* prepare for calling consistentFn in temp context */
@@ -833,11 +1057,10 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 
 	if (haveLossyEntry)
 	{
-		/* Single lossy-page entry, so see if whole page matches */
-		memset(key->entryRes, FALSE, key->nentries);
-		key->entryRes[lossyEntry] = TRUE;
+		/* Have lossy-page entries, so see if whole page matches */
+		res = key->triConsistentFn(key);
 
-		if (callConsistentFn(ginstate, key))
+		if (res == GIN_TRUE || res == GIN_MAYBE)
 		{
 			/* Yes, so clean up ... */
 			MemoryContextSwitchTo(oldCtx);
@@ -854,41 +1077,70 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	/*
 	 * At this point we know that we don't need to return a lossy whole-page
 	 * pointer, but we might have matches for individual exact item pointers,
-	 * possibly in combination with a lossy pointer.  Our strategy if there's
-	 * a lossy pointer is to try the consistentFn both ways and return a hit
-	 * if it accepts either one (forcing the hit to be marked lossy so it will
-	 * be rechecked).  An exception is that we don't need to try it both ways
-	 * if the lossy pointer is in a "hidden" entry, because the consistentFn's
-	 * result can't depend on that.
+	 * possibly in combination with a lossy pointer. Pass lossy pointers as
+	 * MAYBE to the ternary consistent function, to let it decide if this
+	 * tuple satisfies the overall key, even though we don't know if the lossy
+	 * entries match.
 	 *
 	 * Prepare entryRes array to be passed to consistentFn.
 	 */
 	for (i = 0; i < key->nentries; i++)
 	{
 		entry = key->scanEntry[i];
-		if (entry->isFinished == FALSE &&
-			ginCompareItemPointers(&entry->curItem, &key->curItem) == 0)
-			key->entryRes[i] = TRUE;
+		if (entry->isFinished)
+			key->entryRes[i] = GIN_FALSE;
+#if 0
+
+		/*
+		 * This case can't currently happen, because we loaded all the entries
+		 * for this item earlier.
+		 */
+		else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+			key->entryRes[i] = GIN_MAYBE;
+#endif
+		else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+			key->entryRes[i] = GIN_MAYBE;
+		else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
+			key->entryRes[i] = GIN_TRUE;
 		else
-			key->entryRes[i] = FALSE;
+			key->entryRes[i] = GIN_FALSE;
 	}
-	if (haveLossyEntry)
-		key->entryRes[lossyEntry] = TRUE;
 
-	res = callConsistentFn(ginstate, key);
+	res = key->triConsistentFn(key);
 
-	if (!res && haveLossyEntry && lossyEntry < key->nuserentries)
+	switch (res)
 	{
-		/* try the other way for the lossy item */
-		key->entryRes[lossyEntry] = FALSE;
+		case GIN_TRUE:
+			key->curItemMatches = true;
+			/* triConsistentFn set recheckCurItem */
+			break;
+
+		case GIN_FALSE:
+			key->curItemMatches = false;
+			break;
 
-		res = callConsistentFn(ginstate, key);
+		case GIN_MAYBE:
+			key->curItemMatches = true;
+			key->recheckCurItem = true;
+			break;
+
+		default:
+
+			/*
+			 * the 'default' case shouldn't happen, but if the consistent
+			 * function returns something bogus, this is the safe result
+			 */
+			key->curItemMatches = true;
+			key->recheckCurItem = true;
+			break;
 	}
 
-	key->curItemMatches = res;
-	/* If we matched a lossy entry, force recheckCurItem = true */
-	if (haveLossyEntry)
-		key->recheckCurItem = true;
+	/*
+	 * We have a tuple, and we know if it matches or not. If it's a non-match,
+	 * we could continue to find the next matching tuple, but let's break out
+	 * and give scanGetItem a chance to advance the other keys. They might be
+	 * able to skip past to a much higher TID, allowing us to save work.
+	 */
 
 	/* clean up after consistentFn calls */
 	MemoryContextSwitchTo(oldCtx);
@@ -905,117 +1157,121 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
  * keyGetItem() the combination logic is known only to the consistentFn.
  */
 static bool
-scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
 			ItemPointerData *item, bool *recheck)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
-	GinState   *ginstate = &so->ginstate;
-	ItemPointerData myAdvancePast = *advancePast;
 	uint32		i;
-	bool		allFinished;
 	bool		match;
 
-	for (;;)
+	/*----------
+	 * Advance the scan keys in lock-step, until we find an item that matches
+	 * all the keys. If any key reports isFinished, meaning its subset of the
+	 * entries is exhausted, we can stop.  Otherwise, set *item to the next
+	 * matching item.
+	 *
+	 * This logic works only if a keyGetItem stream can never contain both
+	 * exact and lossy pointers for the same page.  Else we could have a
+	 * case like
+	 *
+	 *		stream 1		stream 2
+	 *		...             ...
+	 *		42/6			42/7
+	 *		50/1			42/0xffff
+	 *		...             ...
+	 *
+	 * We would conclude that 42/6 is not a match and advance stream 1,
+	 * thus never detecting the match to the lossy pointer in stream 2.
+	 * (keyGetItem has a similar problem versus entryGetItem.)
+	 *----------
+	 */
+	do
 	{
-		/*
-		 * Advance any entries that are <= myAdvancePast.  In particular,
-		 * since entry->curItem was initialized with ItemPointerSetMin, this
-		 * ensures we fetch the first item for each entry on the first call.
-		 */
-		allFinished = TRUE;
-
-		for (i = 0; i < so->totalentries; i++)
-		{
-			GinScanEntry entry = so->entries[i];
-
-			while (entry->isFinished == FALSE &&
-				   ginCompareItemPointers(&entry->curItem,
-										  &myAdvancePast) <= 0)
-				entryGetItem(ginstate, entry);
-
-			if (entry->isFinished == FALSE)
-				allFinished = FALSE;
-		}
-
-		if (allFinished)
-		{
-			/* all entries exhausted, so we're done */
-			return false;
-		}
-
-		/*
-		 * Perform the consistentFn test for each scan key.  If any key
-		 * reports isFinished, meaning its subset of the entries is exhausted,
-		 * we can stop.  Otherwise, set *item to the minimum of the key
-		 * curItems.
-		 */
-		ItemPointerSetMax(item);
-
-		for (i = 0; i < so->nkeys; i++)
+		ItemPointerSetMin(item);
+		match = true;
+		for (i = 0; i < so->nkeys && match; i++)
 		{
 			GinScanKey	key = so->keys + i;
 
-			keyGetItem(&so->ginstate, so->tempCtx, key);
+			/* Fetch the next item for this key that is > advancePast. */
+			keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
 
 			if (key->isFinished)
-				return false;	/* finished one of keys */
-
-			if (ginCompareItemPointers(&key->curItem, item) < 0)
-				*item = key->curItem;
-		}
+				return false;
 
-		Assert(!ItemPointerIsMax(item));
+			/*
+			 * If it's not a match, we can immediately conclude that nothing
+			 * <= this item matches, without checking the rest of the keys.
+			 */
+			if (!key->curItemMatches)
+			{
+				advancePast = key->curItem;
+				match = false;
+				break;
+			}
 
-		/*----------
-		 * Now *item contains first ItemPointer after previous result.
-		 *
-		 * The item is a valid hit only if all the keys succeeded for either
-		 * that exact TID, or a lossy reference to the same page.
-		 *
-		 * This logic works only if a keyGetItem stream can never contain both
-		 * exact and lossy pointers for the same page.	Else we could have a
-		 * case like
-		 *
-		 *		stream 1		stream 2
-		 *		...				...
-		 *		42/6			42/7
-		 *		50/1			42/0xffff
-		 *		...				...
-		 *
-		 * We would conclude that 42/6 is not a match and advance stream 1,
-		 * thus never detecting the match to the lossy pointer in stream 2.
-		 * (keyGetItem has a similar problem versus entryGetItem.)
-		 *----------
-		 */
-		match = true;
-		for (i = 0; i < so->nkeys; i++)
-		{
-			GinScanKey	key = so->keys + i;
+			/*
+			 * It's a match. We can conclude that nothing < matches, so the
+			 * other key streams can skip to this item.
+			 *
+			 * Beware of lossy pointers, though; from a lossy pointer, we can
+			 * only conclude that nothing smaller than this *block* matches.
+			 */
+			if (ItemPointerIsLossyPage(&key->curItem))
+			{
+				if (GinItemPointerGetBlockNumber(&advancePast) <
+					GinItemPointerGetBlockNumber(&key->curItem))
+				{
+					advancePast.ip_blkid = key->curItem.ip_blkid;
+					advancePast.ip_posid = 0;
+				}
+			}
+			else
+			{
+				Assert(key->curItem.ip_posid > 0);
+				advancePast = key->curItem;
+				advancePast.ip_posid--;
+			}
 
-			if (key->curItemMatches)
+			/*
+			 * If this is the first key, remember this location as a potential
+			 * match, and proceed to check the rest of the keys.
+			 *
author	Pavan Deolasee	2015-05-05 09:19:18 +0000
committer	Pavan Deolasee	2015-05-05 09:19:18 +0000
commit	73fa25c67cbfa24c03e28c96bf356f2592671730 (patch)
tree	10ded7e26abd78d93658cb72fc5cb9d4672eff2a /src/backend
parent	da4d108859bcd7a308ca75aba54281e32968822c (diff)
parent	4a9ab6d8619817f9e3989c99b65140e19041dab7 (diff)