diff options
author | Pavan Deolasee | 2015-05-05 09:19:18 +0000 |
---|---|---|
committer | Pavan Deolasee | 2015-05-05 09:19:18 +0000 |
commit | 73fa25c67cbfa24c03e28c96bf356f2592671730 (patch) | |
tree | 10ded7e26abd78d93658cb72fc5cb9d4672eff2a /src/backend | |
parent | da4d108859bcd7a308ca75aba54281e32968822c (diff) | |
parent | 4a9ab6d8619817f9e3989c99b65140e19041dab7 (diff) |
Merge branch 'XL_MASTER_MERGE_9_4' into XL_NEW_MASTER
Conflicts:
src/test/regress/expected/aggregates.out
src/test/regress/expected/create_index.out
src/test/regress/expected/inherit.out
src/test/regress/expected/join.out
src/test/regress/expected/window.out
src/test/regress/expected/with.out
Diffstat (limited to 'src/backend')
711 files changed, 252214 insertions, 117889 deletions
diff --git a/src/backend/Makefile b/src/backend/Makefile index 611d29fa87..3d5d3eae30 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -2,7 +2,7 @@ # # Makefile for the postgres backend # -# Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group +# Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California # # src/backend/Makefile @@ -51,10 +51,12 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \ $(top_builddir)/src/port/libpgport_srv.a \ $(top_builddir)/src/gtm/client/libgtmclient.a \ $(top_builddir)/src/gtm/common/libgtm.a \ - $(top_builddir)/src/gtm/libpq/libpqcomm.a + $(top_builddir)/src/gtm/libpq/libpqcomm.a \ + $(top_builddir)/src/common/libpgcommon_srv.a -# We put libpgport into OBJS, so remove it from LIBS; also add libldap -LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE) +# We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add +# libldap +LIBS := $(filter-out -lpgport -lpgcommon, $(LIBS)) $(LDAP_LIBS_BE) # The backend doesn't need everything that's in LIBS, however LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS)) @@ -78,36 +80,26 @@ endif ifeq ($(PORTNAME), cygwin) -postgres: $(OBJS) postgres.def libpostgres.a - $(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def - $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(LIBS) - $(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def - $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack,$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(LIBS) - rm -f $@.exp $@.base +postgres: $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) -Wl,--stack,$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$^) $(LIBS) -o $@ -postgres.def: $(OBJS) - $(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^) +# There is no correct way to write a rule that generates two files. +# Rules with two targets don't have that meaning, they are merely +# shorthand for two otherwise separate rules. To be safe for parallel +# make, we must chain the dependencies like this. The semicolon is +# important, otherwise make will choose some built-in rule. -libpostgres.a: postgres.def - $(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@ +libpostgres.a: postgres ; endif # cygwin ifeq ($(PORTNAME), win32) LIBS += -lsecur32 -postgres: $(OBJS) postgres.def libpostgres.a $(WIN32RES) - $(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def - $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS) - $(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def - $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS) - rm -f $@.exp $@.base - -postgres.def: $(OBJS) - $(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^) +postgres: $(OBJS) $(WIN32RES) + $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS) -o $@$(X) -libpostgres.a: postgres.def - $(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@ +libpostgres.a: postgres ; endif # win32 @@ -253,6 +245,7 @@ else endif ifeq ($(MAKE_EXPORTS), true) $(INSTALL_DATA) $(POSTGRES_IMP) '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)' + $(INSTALL_PROGRAM) $(MKLDEXPORT) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh' endif .PHONY: install-bin @@ -271,6 +264,7 @@ endif endif ifeq ($(MAKE_EXPORTS), true) $(MKDIR_P) '$(DESTDIR)$(pkglibdir)' + $(MKDIR_P) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)' endif @@ -280,6 +274,7 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/postgres$(X)' '$(DESTDIR)$(bindir)/postmaster' ifeq ($(MAKE_EXPORTS), true) rm -f '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)' + rm -f '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh' endif ifeq ($(PORTNAME), cygwin) ifeq ($(MAKE_DLL), true) @@ -308,10 +303,10 @@ clean: $(top_builddir)/src/include/utils/fmgroids.h \ $(top_builddir)/src/include/utils/probes.h ifeq ($(PORTNAME), cygwin) - rm -f postgres.dll postgres.def libpostgres.a + rm -f postgres.dll libpostgres.a endif ifeq ($(PORTNAME), win32) - rm -f postgres.dll postgres.def libpostgres.a $(WIN32RES) + rm -f postgres.dll libpostgres.a $(WIN32RES) endif distclean: clean @@ -329,7 +324,6 @@ maintainer-clean: distclean catalog/postgres.description \ catalog/postgres.shdescription \ replication/repl_gram.c \ - replication/repl_gram.h \ replication/repl_scanner.c \ utils/fmgroids.h \ utils/fmgrtab.c \ diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0366d59624..c32088f81d 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,6 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = common gist hash heap index nbtree transam gin spgist +SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 62479c04bb..b81bddf3c1 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -21,7 +21,7 @@ * tuptoaster.c. * * This change will break any code that assumes it needn't detoast values - * that have been put into a tuple but never sent to disk. Hopefully there + * that have been put into a tuple but never sent to disk. Hopefully there * are few such places. * * Varlenas still have alignment 'i' (or 'd') in pg_type/pg_attribute, since @@ -50,7 +50,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -69,6 +69,7 @@ #include "access/tuptoaster.h" #include "executor/tuptable.h" #ifdef XCP +#include "lib/stringinfo.h" #include "utils/memutils.h" #endif @@ -400,7 +401,7 @@ nocachegetattr(HeapTuple tuple, /* * Otherwise, check for non-fixed-length attrs up to and including - * target. If there aren't any, it's safe to cheaply initialize the + * target. If there aren't any, it's safe to cheaply initialize the * cached offsets for these attrs. */ if (HeapTupleHasVarWidth(tuple)) @@ -467,7 +468,7 @@ nocachegetattr(HeapTuple tuple, * * Note - This loop is a little tricky. For each non-null attribute, * we have to first account for alignment padding before the attr, - * then advance over the attr based on its length. Nulls have no + * then advance over the attr based on its length. Nulls have no * storage and no alignment padding either. We can use/set * attcacheoff until we reach either a null or a var-width attribute. */ @@ -552,17 +553,17 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) result = ObjectIdGetDatum(HeapTupleGetOid(tup)); break; case MinTransactionIdAttributeNumber: - result = TransactionIdGetDatum(HeapTupleHeaderGetXmin(tup->t_data)); + result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data)); break; case MaxTransactionIdAttributeNumber: - result = TransactionIdGetDatum(HeapTupleHeaderGetXmax(tup->t_data)); + result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data)); break; case MinCommandIdAttributeNumber: case MaxCommandIdAttributeNumber: /* * cmin and cmax are now both aliases for the same field, which - * can in fact also be a combo command id. XXX perhaps we should + * can in fact also be a combo command id. XXX perhaps we should * return the "real" cmin or cmax if possible, that is if we are * inside the originating transaction? */ @@ -641,6 +642,41 @@ heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest) memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len); } +/* ---------------- + * heap_copy_tuple_as_datum + * + * copy a tuple as a composite-type Datum + * ---------------- + */ +Datum +heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc) +{ + HeapTupleHeader td; + + /* + * If the tuple contains any external TOAST pointers, we have to inline + * those fields to meet the conventions for composite-type Datums. + */ + if (HeapTupleHasExternal(tuple)) + return toast_flatten_tuple_to_datum(tuple->t_data, + tuple->t_len, + tupleDesc); + + /* + * Fast path for easy case: just make a palloc'd copy and insert the + * correct composite-Datum header fields (since those may not be set if + * the given tuple came from disk, rather than from heap_form_tuple). + */ + td = (HeapTupleHeader) palloc(tuple->t_len); + memcpy((char *) td, (char *) tuple->t_data, tuple->t_len); + + HeapTupleHeaderSetDatumLength(td, tuple->t_len); + HeapTupleHeaderSetTypeId(td, tupleDesc->tdtypeid); + HeapTupleHeaderSetTypMod(td, tupleDesc->tdtypmod); + + return PointerGetDatum(td); +} + /* * heap_form_tuple * construct a tuple from the given values[] and isnull[] arrays, @@ -659,7 +695,6 @@ heap_form_tuple(TupleDesc tupleDescriptor, data_len; int hoff; bool hasnull = false; - Form_pg_attribute *att = tupleDescriptor->attrs; int numberOfAttributes = tupleDescriptor->natts; int i; @@ -670,28 +705,14 @@ heap_form_tuple(TupleDesc tupleDescriptor, numberOfAttributes, MaxTupleAttributeNumber))); /* - * Check for nulls and embedded tuples; expand any toasted attributes in - * embedded tuples. This preserves the invariant that toasting can only - * go one level deep. - * - * We can skip calling toast_flatten_tuple_attribute() if the attribute - * couldn't possibly be of composite type. All composite datums are - * varlena and have alignment 'd'; furthermore they aren't arrays. Also, - * if an attribute is already toasted, it must have been sent to disk - * already and so cannot contain toasted attributes. + * Check for nulls */ for (i = 0; i < numberOfAttributes; i++) { if (isnull[i]) - hasnull = true; - else if (att[i]->attlen == -1 && - att[i]->attalign == 'd' && - att[i]->attndims == 0 && - !VARATT_IS_EXTENDED(DatumGetPointer(values[i]))) { - values[i] = toast_flatten_tuple_attribute(values[i], - att[i]->atttypid, - att[i]->atttypmod); + hasnull = true; + break; } } @@ -713,7 +734,7 @@ heap_form_tuple(TupleDesc tupleDescriptor, len += data_len; /* - * Allocate and zero the space needed. Note that the tuple body and + * Allocate and zero the space needed. Note that the tuple body and * HeapTupleData management structure are allocated in one chunk. */ tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len); @@ -721,7 +742,8 @@ heap_form_tuple(TupleDesc tupleDescriptor, /* * And fill in the information. Note we fill the Datum fields even though - * this tuple may never become a Datum. + * this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum + * identify the tuple type if needed. */ tuple->t_len = len; ItemPointerSetInvalid(&(tuple->t_self)); @@ -1641,7 +1663,6 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor, data_len; int hoff; bool hasnull = false; - Form_pg_attribute *att = tupleDescriptor->attrs; int numberOfAttributes = tupleDescriptor->natts; int i; @@ -1652,28 +1673,14 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor, numberOfAttributes, MaxTupleAttributeNumber))); /* - * Check for nulls and embedded tuples; expand any toasted attributes in - * embedded tuples. This preserves the invariant that toasting can only - * go one level deep. - * - * We can skip calling toast_flatten_tuple_attribute() if the attribute - * couldn't possibly be of composite type. All composite datums are - * varlena and have alignment 'd'; furthermore they aren't arrays. Also, - * if an attribute is already toasted, it must have been sent to disk - * already and so cannot contain toasted attributes. + * Check for nulls */ for (i = 0; i < numberOfAttributes; i++) { if (isnull[i]) - hasnull = true; - else if (att[i]->attlen == -1 && - att[i]->attalign == 'd' && - att[i]->attndims == 0 && - !VARATT_IS_EXTENDED(values[i])) { - values[i] = toast_flatten_tuple_attribute(values[i], - att[i]->atttypid, - att[i]->atttypmod); + hasnull = true; + break; } } diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index 76c76e9e89..5fd400990b 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -4,7 +4,7 @@ * This file contains index tuple accessor and mutator routines, * as well as various tuple utilities. * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -71,7 +71,7 @@ index_form_tuple(TupleDesc tupleDescriptor, /* * If value is stored EXTERNAL, must fetch it so we are not depending - * on outside storage. This should be improved someday. + * on outside storage. This should be improved someday. */ if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i]))) { @@ -158,6 +158,11 @@ index_form_tuple(TupleDesc tupleDescriptor, if (tupmask & HEAP_HASVARWIDTH) infomask |= INDEX_VAR_MASK; + /* Also assert we got rid of external attributes */ +#ifdef TOAST_INDEX_HACK + Assert((tupmask & HEAP_HASEXTERNAL) == 0); +#endif + /* * Here we make sure that the size will fit in the field reserved for it * in t_info. @@ -165,9 +170,8 @@ index_form_tuple(TupleDesc tupleDescriptor, if ((size & INDEX_SIZE_MASK) != size) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row requires %lu bytes, maximum size is %lu", - (unsigned long) size, - (unsigned long) INDEX_SIZE_MASK))); + errmsg("index row requires %zu bytes, maximum size is %zu", + size, (Size) INDEX_SIZE_MASK))); infomask |= size; @@ -276,7 +280,7 @@ nocache_index_getattr(IndexTuple tup, /* * Otherwise, check for non-fixed-length attrs up to and including - * target. If there aren't any, it's safe to cheaply initialize the + * target. If there aren't any, it's safe to cheaply initialize the * cached offsets for these attrs. */ if (IndexTupleHasVarwidths(tup)) @@ -343,7 +347,7 @@ nocache_index_getattr(IndexTuple tup, * * Note - This loop is a little tricky. For each non-null attribute, * we have to first account for alignment padding before the attr, - * then advance over the attr based on its length. Nulls have no + * then advance over the attr based on its length. Nulls have no * storage and no alignment padding either. We can use/set * attcacheoff until we reach either a null or a var-width attribute. */ diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 10a45a3146..d033cdd09a 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -10,7 +10,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -28,6 +28,9 @@ #ifdef PGXC #include "pgxc/pgxc.h" #endif +#include "utils/memdebug.h" +#include "utils/memutils.h" + static void printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo); @@ -67,6 +70,7 @@ typedef struct TupleDesc attrinfo; /* The attr info we are set up for */ int nattrs; PrinttupAttrInfo *myinfo; /* Cached info about each attr */ + MemoryContext tmpcontext; /* Memory context for per-row workspace */ } DR_printtup; /* ---------------- @@ -93,6 +97,7 @@ printtup_create_DR(CommandDest dest) self->attrinfo = NULL; self->nattrs = 0; self->myinfo = NULL; + self->tmpcontext = NULL; return (DestReceiver *) self; } @@ -130,6 +135,18 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo) DR_printtup *myState = (DR_printtup *) self; Portal portal = myState->portal; + /* + * Create a temporary memory context that we can reset once per row to + * recover palloc'd memory. This avoids any problems with leaks inside + * datatype output routines, and should be faster than retail pfree's + * anyway. + */ + myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "printtup", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3) { /* @@ -173,7 +190,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo) * or some similar function; it does not contain a full set of fields. * The targetlist will be NIL when executing a utility function that does * not have a plan. If the targetlist isn't NIL then it is a Query node's - * targetlist; it is up to us to ignore resjunk columns in it. The formats[] + * targetlist; it is up to us to ignore resjunk columns in it. The formats[] * array pointer might be NULL (if we are doing Describe on a prepared stmt); * send zeroes for the format codes in that case. */ @@ -309,6 +326,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) { TupleDesc typeinfo = slot->tts_tupleDescriptor; DR_printtup *myState = (DR_printtup *) self; + MemoryContext oldcontext; StringInfoData buf; int natts = typeinfo->natts; int i; @@ -341,8 +359,11 @@ printtup(TupleTableSlot *slot, DestReceiver *self) /* Make sure the tuple is fully deconstructed */ slot_getallattrs(slot); + /* Switch into per-row context so we can recover memory below */ + oldcontext = MemoryContextSwitchTo(myState->tmpcontext); + /* - * Prepare a DataRow message + * Prepare a DataRow message (note buffer is in per-row context) */ pq_beginmessage(&buf, 'D'); @@ -354,8 +375,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) for (i = 0; i < natts; ++i) { PrinttupAttrInfo *thisState = myState->myinfo + i; - Datum origattr = slot->tts_values[i], - attr; + Datum attr = slot->tts_values[i]; if (slot->tts_isnull[i]) { @@ -364,13 +384,15 @@ printtup(TupleTableSlot *slot, DestReceiver *self) } /* - * If we have a toasted datum, forcibly detoast it here to avoid - * memory leakage inside the type's output routine. + * Here we catch undefined bytes in datums that are returned to the + * client without hitting disk; see comments at the related check in + * PageAddItem(). This test is most useful for uncompressed, + * non-external datums, but we're quite likely to see such here when + * testing new C functions. */ if (thisState->typisvarlena) - attr = PointerGetDatum(PG_DETOAST_DATUM(origattr)); - else - attr = origattr; + VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr), + VARSIZE_ANY(attr)); if (thisState->format == 0) { @@ -379,7 +401,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self) outputstr = OutputFunctionCall(&thisState->finfo, attr); pq_sendcountedtext(&buf, outputstr, strlen(outputstr), false); - pfree(outputstr); } else { @@ -390,15 +411,14 @@ printtup(TupleTableSlot *slot, DestReceiver *self) pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4); pq_sendbytes(&buf, VARDATA(outputbytes), VARSIZE(outputbytes) - VARHDRSZ); - pfree(outputbytes); } - - /* Clean up detoasted copy, if any */ - if (DatumGetPointer(attr) != DatumGetPointer(origattr)) - pfree(DatumGetPointer(attr)); } pq_endmessage(&buf); + + /* Return to caller's context, and flush row's temporary memory */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(myState->tmpcontext); } /* ---------------- @@ -410,6 +430,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self) { TupleDesc typeinfo = slot->tts_tupleDescriptor; DR_printtup *myState = (DR_printtup *) self; + MemoryContext oldcontext; StringInfoData buf; int natts = typeinfo->natts; int i, @@ -423,6 +444,9 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self) /* Make sure the tuple is fully deconstructed */ slot_getallattrs(slot); + /* Switch into per-row context so we can recover memory below */ + oldcontext = MemoryContextSwitchTo(myState->tmpcontext); + /* * tell the frontend to expect new tuple data (in ASCII style) */ @@ -454,8 +478,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self) for (i = 0; i < natts; ++i) { PrinttupAttrInfo *thisState = myState->myinfo + i; - Datum origattr = slot->tts_values[i], - attr; + Datum attr = slot->tts_values[i]; char *outputstr; if (slot->tts_isnull[i]) @@ -463,25 +486,15 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self) Assert(thisState->format == 0); - /* - * If we have a toasted datum, forcibly detoast it here to avoid - * memory leakage inside the type's output routine. - */ - if (thisState->typisvarlena) - attr = PointerGetDatum(PG_DETOAST_DATUM(origattr)); - else - attr = origattr; - outputstr = OutputFunctionCall(&thisState->finfo, attr); pq_sendcountedtext(&buf, outputstr, strlen(outputstr), true); - pfree(outputstr); - - /* Clean up detoasted copy, if any */ - if (DatumGetPointer(attr) != DatumGetPointer(origattr)) - pfree(DatumGetPointer(attr)); } pq_endmessage(&buf); + + /* Return to caller's context, and flush row's temporary memory */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(myState->tmpcontext); } /* ---------------- @@ -498,6 +511,10 @@ printtup_shutdown(DestReceiver *self) myState->myinfo = NULL; myState->attrinfo = NULL; + + if (myState->tmpcontext) + MemoryContextDelete(myState->tmpcontext); + myState->tmpcontext = NULL; } /* ---------------- @@ -560,8 +577,7 @@ debugtup(TupleTableSlot *slot, DestReceiver *self) TupleDesc typeinfo = slot->tts_tupleDescriptor; int natts = typeinfo->natts; int i; - Datum origattr, - attr; + Datum attr; char *value; bool isnull; Oid typoutput; @@ -569,30 +585,15 @@ debugtup(TupleTableSlot *slot, DestReceiver *self) for (i = 0; i < natts; ++i) { - origattr = slot_getattr(slot, i + 1, &isnull); + attr = slot_getattr(slot, i + 1, &isnull); if (isnull) continue; getTypeOutputInfo(typeinfo->attrs[i]->atttypid, &typoutput, &typisvarlena); - /* - * If we have a toasted datum, forcibly detoast it here to avoid - * memory leakage inside the type's output routine. - */ - if (typisvarlena) - attr = PointerGetDatum(PG_DETOAST_DATUM(origattr)); - else - attr = origattr; - value = OidOutputFunctionCall(typoutput, attr); printatt((unsigned) i + 1, typeinfo->attrs[i], value); - - pfree(value); - - /* Clean up detoasted copy, if any */ - if (DatumGetPointer(attr) != DatumGetPointer(origattr)) - pfree(DatumGetPointer(attr)); } printf("\t----\n"); } @@ -611,6 +612,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self) { TupleDesc typeinfo = slot->tts_tupleDescriptor; DR_printtup *myState = (DR_printtup *) self; + MemoryContext oldcontext; StringInfoData buf; int natts = typeinfo->natts; int i, @@ -624,6 +626,9 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self) /* Make sure the tuple is fully deconstructed */ slot_getallattrs(slot); + /* Switch into per-row context so we can recover memory below */ + oldcontext = MemoryContextSwitchTo(myState->tmpcontext); + /* * tell the frontend to expect new tuple data (in binary style) */ @@ -655,8 +660,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self) for (i = 0; i < natts; ++i) { PrinttupAttrInfo *thisState = myState->myinfo + i; - Datum origattr = slot->tts_values[i], - attr; + Datum attr = slot->tts_values[i]; bytea *outputbytes; if (slot->tts_isnull[i]) @@ -664,26 +668,15 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self) Assert(thisState->format == 1); - /* - * If we have a toasted datum, forcibly detoast it here to avoid - * memory leakage inside the type's output routine. - */ - if (thisState->typisvarlena) - attr = PointerGetDatum(PG_DETOAST_DATUM(origattr)); - else - attr = origattr; - outputbytes = SendFunctionCall(&thisState->finfo, attr); - /* We assume the result will not have been toasted */ pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4); pq_sendbytes(&buf, VARDATA(outputbytes), VARSIZE(outputbytes) - VARHDRSZ); - pfree(outputbytes); - - /* Clean up detoasted copy, if any */ - if (DatumGetPointer(attr) != DatumGetPointer(origattr)) - pfree(DatumGetPointer(attr)); } pq_endmessage(&buf); + + /* Return to caller's context, and flush row's temporary memory */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(myState->tmpcontext); } diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 607a80d7d4..522b671993 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -3,7 +3,7 @@ * reloptions.c * Core support for relation options (pg_class.reloptions) * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -17,12 +17,14 @@ #include "access/gist_private.h" #include "access/hash.h" +#include "access/htup_details.h" #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" +#include "commands/view.h" #include "nodes/makefuncs.h" #include "utils/array.h" #include "utils/attoptcache.h" @@ -61,6 +63,14 @@ static relopt_bool boolRelOpts[] = }, { { + "user_catalog_table", + "Declare a table as an additional catalog table, e.g. for the purpose of logical replication", + RELOPT_KIND_HEAP + }, + false + }, + { + { "fastupdate", "Enables \"fast update\" feature for this GIN index", RELOPT_KIND_GIN @@ -163,6 +173,14 @@ static relopt_int intRelOpts[] = }, { { + "autovacuum_multixact_freeze_min_age", + "Minimum multixact age at which VACUUM should freeze a row multixact's, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST + }, + -1, 0, 1000000000 + }, + { + { "autovacuum_freeze_max_age", "Age at which to autovacuum a table to prevent transaction ID wraparound", RELOPT_KIND_HEAP | RELOPT_KIND_TOAST @@ -171,11 +189,27 @@ static relopt_int intRelOpts[] = }, { { + "autovacuum_multixact_freeze_max_age", + "Multixact age at which to autovacuum a table to prevent multixact wraparound", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST + }, + -1, 100000000, 2000000000 + }, + { + { "autovacuum_freeze_table_age", - "Age at which VACUUM should perform a full table sweep to replace old Xid values with FrozenXID", + "Age at which VACUUM should perform a full table sweep to freeze row versions", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST + }, -1, 0, 2000000000 + }, + { + { + "autovacuum_multixact_freeze_table_age", + "Age of multixact at which VACUUM should perform a full table sweep to freeze row versions", RELOPT_KIND_HEAP | RELOPT_KIND_TOAST }, -1, 0, 2000000000 }, + /* list terminator */ {{NULL}} }; @@ -247,6 +281,17 @@ static relopt_string stringRelOpts[] = gistValidateBufferingOption, "auto" }, + { + { + "check_option", + "View has WITH CHECK OPTION defined (local or cascaded).", + RELOPT_KIND_VIEW + }, + 0, + true, + validateWithCheckOption, + NULL + }, /* list terminator */ {{NULL}} }; @@ -495,7 +540,7 @@ add_real_reloption(bits32 kinds, char *name, char *desc, double default_val, * Add a new string reloption * * "validator" is an optional function pointer that can be used to test the - * validity of the values. It must elog(ERROR) when the argument string is + * validity of the values. It must elog(ERROR) when the argument string is * not acceptable for the variable. Note that the default value must pass * the validation. */ @@ -790,7 +835,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions) case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_VIEW: - case RELKIND_UNCATALOGED: + case RELKIND_MATVIEW: options = heap_reloptions(classForm->relkind, datum, false); break; case RELKIND_INDEX: @@ -823,7 +868,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions) * is returned. * * Note: values of type int, bool and real are allocated as part of the - * returned array. Values of type string are allocated separately and must + * returned array. Values of type string are allocated separately and must * be freed by the caller. */ relopt_value * @@ -1145,12 +1190,22 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_max_age)}, {"autovacuum_freeze_table_age", RELOPT_TYPE_INT, offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_table_age)}, + {"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_min_age)}, + {"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_max_age)}, + {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_table_age)}, {"autovacuum_vacuum_scale_factor", RELOPT_TYPE_REAL, offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, vacuum_scale_factor)}, {"autovacuum_analyze_scale_factor", RELOPT_TYPE_REAL, offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, analyze_scale_factor)}, {"security_barrier", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, security_barrier)}, + {"check_option", RELOPT_TYPE_STRING, + offsetof(StdRdOptions, check_option_offset)}, + {"user_catalog_table", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, user_catalog_table)} }; options = parseRelOptions(reloptions, validate, kind, &numoptions); @@ -1191,6 +1246,7 @@ heap_reloptions(char relkind, Datum reloptions, bool validate) } return (bytea *) rdopts; case RELKIND_RELATION: + case RELKIND_MATVIEW: return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP); case RELKIND_VIEW: return default_reloptions(reloptions, validate, RELOPT_KIND_VIEW); diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c index 37d77ed7d5..85c31c49c1 100644 --- a/src/backend/access/common/scankey.c +++ b/src/backend/access/common/scankey.c @@ -3,7 +3,7 @@ * scankey.c * scan key support code * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index f5a43d47b8..2e48b32ba3 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -5,11 +5,11 @@ * * These functions provide conversion between rowtypes that are logically * equivalent but might have columns in a different order or different sets - * of dropped columns. There is some overlap of functionality with the + * of dropped columns. There is some overlap of functionality with the * executor's "junkfilter" routines, but these functions work on bare * HeapTuples rather than TupleTableSlots. * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,6 +20,7 @@ */ #include "postgres.h" +#include "access/htup_details.h" #include "access/tupconvert.h" #include "utils/builtins.h" diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index daac277f03..f3b36893f7 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -3,7 +3,7 @@ * tupdesc.c * POSTGRES tuple descriptor support code * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -19,12 +19,13 @@ #include "postgres.h" +#include "access/htup_details.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "parser/parse_type.h" #include "utils/acl.h" #include "utils/builtins.h" -#include "utils/resowner.h" +#include "utils/resowner_private.h" #include "utils/syscache.h" @@ -203,6 +204,7 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc) if (constr->check[i].ccbin) cpy->check[i].ccbin = pstrdup(constr->check[i].ccbin); cpy->check[i].ccvalid = constr->check[i].ccvalid; + cpy->check[i].ccnoinherit = constr->check[i].ccnoinherit; } } @@ -216,6 +218,47 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc) } /* + * TupleDescCopyEntry + * This function copies a single attribute structure from one tuple + * descriptor to another. + * + * !!! Constraints and defaults are not copied !!! + */ +void +TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno, + TupleDesc src, AttrNumber srcAttno) +{ + /* + * sanity checks + */ + AssertArg(PointerIsValid(src)); + AssertArg(PointerIsValid(dst)); + AssertArg(srcAttno >= 1); + AssertArg(srcAttno <= src->natts); + AssertArg(dstAttno >= 1); + AssertArg(dstAttno <= dst->natts); + + memcpy(dst->attrs[dstAttno - 1], src->attrs[srcAttno - 1], + ATTRIBUTE_FIXED_PART_SIZE); + + /* + * Aside from updating the attno, we'd better reset attcacheoff. + * + * XXX Actually, to be entirely safe we'd need to reset the attcacheoff of + * all following columns in dst as well. Current usage scenarios don't + * require that though, because all following columns will get initialized + * by other uses of this function or TupleDescInitEntry. So we cheat a + * bit to avoid a useless O(N^2) penalty. + */ + dst->attrs[dstAttno - 1]->attnum = dstAttno; + dst->attrs[dstAttno - 1]->attcacheoff = -1; + + /* since we're not copying constraints or defaults, clear these */ + dst->attrs[dstAttno - 1]->attnotnull = false; + dst->attrs[dstAttno - 1]->atthasdef = false; +} + +/* * Free a TupleDesc including all substructure */ void @@ -416,7 +459,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2) for (j = 0; j < n; check2++, j++) { if (strcmp(check1->ccname, check2->ccname) == 0 && - strcmp(check1->ccbin, check2->ccbin) == 0) + strcmp(check1->ccbin, check2->ccbin) == 0 && + check1->ccvalid == check2->ccvalid && + check1->ccnoinherit == check2->ccnoinherit) break; } if (j >= n) @@ -536,7 +581,7 @@ TupleDescInitEntryCollation(TupleDesc desc, * Given a relation schema (list of ColumnDef nodes), build a TupleDesc. * * Note: the default assumption is no OIDs; caller may modify the returned - * TupleDesc if it wants OIDs. Also, tdtypeid will need to be filled in + * TupleDesc if it wants OIDs. Also, tdtypeid will need to be filled in * later on. */ TupleDesc @@ -579,8 +624,7 @@ BuildDescForRelation(List *schema) aclresult = pg_type_aclcheck(atttypid, GetUserId(), ACL_USAGE); if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_TYPE, - format_type_be(atttypid)); + aclcheck_error_type(aclresult, atttypid); attcollation = GetColumnDefCollation(NULL, entry, atttypid); attdim = list_length(entry->typeName->arrayBounds); diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile index 889dde6a27..db4f496a4d 100644 --- a/src/backend/access/gin/Makefile +++ b/src/backend/access/gin/Makefile @@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \ ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \ - ginbulk.o ginfast.o + ginbulk.o ginfast.o ginpostinglist.o ginlogic.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index 67159d8529..fade0cbb61 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -104,7 +104,7 @@ a few thousand entries can be much faster than retail insertion. (The win comes mainly from not having to do multiple searches/insertions when the same key appears in multiple new heap tuples.) -Key entries are nominally of the same IndexEntry format as used in other +Key entries are nominally of the same IndexTuple format as used in other index types, but since a leaf key entry typically refers to multiple heap tuples, there are significant differences. (See GinFormTuple, which works by building a "normal" index tuple and then modifying it.) The points to @@ -135,15 +135,15 @@ same category of null entry are merged into one index entry just as happens with ordinary key entries. * In a key entry at the btree leaf level, at the next SHORTALIGN boundary, -there is an array of zero or more ItemPointers, which store the heap tuple -TIDs for which the indexable items contain this key. This is called the -"posting list". The TIDs in a posting list must appear in sorted order. -If the list would be too big for the index tuple to fit on an index page, -the ItemPointers are pushed out to a separate posting page or pages, and -none appear in the key entry itself. The separate pages are called a -"posting tree"; they are organized as a btree of ItemPointer values. -Note that in either case, the ItemPointers associated with a key can -easily be read out in sorted order; this is relied on by the scan +there is a list of item pointers, in compressed format (see Posting List +Compression section), pointing to the heap tuples for which the indexable +items contain this key. This is called the "posting list". + +If the list would be too big for the index tuple to fit on an index page, the +ItemPointers are pushed out to a separate posting page or pages, and none +appear in the key entry itself. The separate pages are called a "posting +tree" (see below); Note that in either case, the ItemPointers associated with +a key can easily be read out in sorted order; this is relied on by the scan algorithms. * The index tuple header fields of a leaf key entry are abused as follows: @@ -163,6 +163,11 @@ algorithms. * The posting list can be accessed with GinGetPosting(itup) +* If GinITupIsCompressed(itup), the posting list is stored in compressed + format. Otherwise it is just an array of ItemPointers. New tuples are always + stored in compressed format, uncompressed items can be present if the + database was migrated from 9.3 or earlier version. + 2) Posting tree case: * ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number @@ -176,7 +181,7 @@ algorithms. GinSetPostingTree macro. * If IndexTupleHasNulls(itup) is true, the null category byte can be - accessed/set with GinGetNullCategory(itup) / GinSetNullCategory(itup,c) + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) * The posting list is not present and must not be accessed. @@ -210,6 +215,145 @@ fit on one pending-list page must have those pages to itself, even if this results in wasting much of the space on the preceding page and the last page for the tuple.) +Posting tree +------------ + +If a posting list is too large to store in-line in a key entry, a posting tree +is created. A posting tree is a B-tree structure, where the ItemPointer is +used as the key. + +Internal posting tree pages use the standard PageHeader and the same "opaque" +struct as other GIN page, but do not contain regular index tuples. Instead, +the contents of the page is an array of PostingItem structs. Each PostingItem +consists of the block number of the child page, and the right bound of that +child page, as an ItemPointer. The right bound of the page is stored right +after the page header, before the PostingItem array. + +Posting tree leaf pages also use the standard PageHeader and opaque struct, +and the right bound of the page is stored right after the page header, but +the page content comprises of a number of compressed posting lists. The +compressed posting lists are stored one after each other, between page header +and pd_lower. The space between pd_lower and pd_upper is unused, which allows +full-page images of posting tree leaf pages to skip the unused space in middle +(buffer_std = true in XLogRecData). + +The item pointers are stored in a number of independent compressed posting +lists (also called segments), instead of one big one, to make random access +to a given item pointer faster: to find an item in a compressed list, you +have to read the list from the beginning, but when the items are split into +multiple lists, you can first skip over to the list containing the item you're +looking for, and read only that segment. Also, an update only needs to +re-encode the affected segment. + +Posting List Compression +------------------------ + +To fit as many item pointers on a page as possible, posting tree leaf pages +and posting lists stored inline in entry tree leaf tuples use a lightweight +form of compression. We take advantage of the fact that the item pointers +are stored in sorted order. Instead of storing the block and offset number of +each item pointer separately, we store the difference from the previous item. +That in itself doesn't do much, but it allows us to use so-called varbyte +encoding to compress them. + +Varbyte encoding is a method to encode integers, allowing smaller numbers to +take less space at the cost of larger numbers. Each integer is represented by +variable number of bytes. High bit of each byte in varbyte encoding determines +whether the next byte is still part of this number. Therefore, to read a single +varbyte encoded number, you have to read bytes until you find a byte with the +high bit not set. + +When encoding, the block and offset number forming the item pointer are +combined into a single integer. The offset number is stored in the 11 low +bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number +is stored in the higher bits. That requires 43 bits in total, which +conveniently fits in at most 6 bytes. + +A compressed posting list is passed around and stored on disk in a +PackedPostingList struct. The first item in the list is stored uncompressed +as a regular ItemPointerData, followed by the length of the list in bytes, +followed by the packed items. + +Concurrency +----------- + +The entry tree and each posting tree is a B-tree, with right-links connecting +sibling pages at the same level. This is the same structure that is used in +the regular B-tree indexam (invented by Lehman & Yao), but we don't support +scanning a GIN trees backwards, so we don't need left-links. + +To avoid deadlocks, B-tree pages must always be locked in the same order: +left to right, and bottom to top. When searching, the tree is traversed from +top to bottom, so the lock on the parent page must be released before +descending to the next level. Concurrent page splits move the keyspace to +right, so after following a downlink, the page actually containing the key +we're looking for might be somewhere to the right of the page we landed on. +In that case, we follow the right-links until we find the page we're looking +for. + +To delete a page, the page's left sibling, the target page, and its parent, +are locked in that order, and the page is marked as deleted. However, a +concurrent search might already have read a pointer to the page, and might be +just about to follow it. A page can be reached via the right-link of its left +sibling, or via its downlink in the parent. + +To prevent a backend from reaching a deleted page via a right-link, when +following a right-link the lock on the previous page is not released until +the lock on next page has been acquired. + +The downlink is more tricky. A search descending the tree must release the +lock on the parent page before locking the child, or it could deadlock with +a concurrent split of the child page; a page split locks the parent, while +already holding a lock on the child page. However, posting trees are only +fully searched from left to right, starting from the leftmost leaf. (The +tree-structure is only needed by insertions, to quickly find the correct +insert location). So as long as we don't delete the leftmost page on each +level, a search can never follow a downlink to page that's about to be +deleted. + +The previous paragraph's reasoning only applies to searches, and only to +posting trees. To protect from inserters following a downlink to a deleted +page, vacuum simply locks out all concurrent insertions to the posting tree, +by holding a super-exclusive lock on the posting tree root. Inserters hold a +pin on the root page, but searches do not, so while new searches cannot begin +while root page is locked, any already-in-progress scans can continue +concurrently with vacuum. In the entry tree, we never delete pages. + +(This is quite different from the mechanism the btree indexam uses to make +page-deletions safe; it stamps the deleted pages with an XID and keeps the +deleted pages around with the right-link intact until all concurrent scans +have finished.) + +Compatibility +------------- + +Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in +uncompressed format because of pg_upgrade from 9.3 or earlier versions. +For compatibility, old uncompressed format is also supported. Following +rules are used to handle it: + +* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list. +This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid). +Use GinItupIsCompressed(itup) to check the flag. + +* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag. + Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to + check and set this flag. + +* All scan operations check format of posting list add use corresponding code +to read its content. + +* When updating an index tuple containing an uncompressed posting list, it +will be replaced with new index tuple containing a compressed list. + +* When updating an uncompressed posting tree leaf page, it's compressed. + +* If vacuum finds some dead TIDs in uncompressed posting lists, they are +converted into compressed posting lists. This assumes that the compressed +posting list fits in the space occupied by the uncompressed list. IOW, we +assume that the compressed version of the page, with the dead items removed, +takes less space than the old uncompressed version. + Limitations ----------- diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c index f1f934331c..66cea28113 100644 --- a/src/backend/access/gin/ginarrayproc.c +++ b/src/backend/access/gin/ginarrayproc.c @@ -4,7 +4,7 @@ * support functions for GIN's indexing of any array * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -197,7 +197,7 @@ ginarrayconsistent(PG_FUNCTION_ARGS) /* * Must have all elements in check[] true; no discrimination - * against nulls here. This is because array_contain_compare and + * against nulls here. This is because array_contain_compare and * array_eq handle nulls differently ... */ res = true; @@ -218,3 +218,88 @@ ginarrayconsistent(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } + +/* + * triconsistent support function + */ +Datum +ginarraytriconsistent(PG_FUNCTION_ARGS) +{ + GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0); + StrategyNumber strategy = PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(5); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(6); + GinTernaryValue res; + int32 i; + + switch (strategy) + { + case GinOverlapStrategy: + /* must have a match for at least one non-null element */ + res = GIN_FALSE; + for (i = 0; i < nkeys; i++) + { + if (!nullFlags[i]) + { + if (check[i] == GIN_TRUE) + { + res = GIN_TRUE; + break; + } + else if (check[i] == GIN_MAYBE && res == GIN_FALSE) + { + res = GIN_MAYBE; + } + } + } + break; + case GinContainsStrategy: + /* must have all elements in check[] true, and no nulls */ + res = GIN_TRUE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE || nullFlags[i]) + { + res = GIN_FALSE; + break; + } + if (check[i] == GIN_MAYBE) + { + res = GIN_MAYBE; + } + } + break; + case GinContainedStrategy: + /* can't do anything else useful here */ + res = GIN_MAYBE; + break; + case GinEqualStrategy: + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... + */ + res = GIN_MAYBE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE) + { + res = GIN_FALSE; + break; + } + } + break; + default: + elog(ERROR, "ginarrayconsistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_GIN_TERNARY_VALUE(res); +} diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index b160551b5e..27f88e0eb2 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -18,8 +18,15 @@ #include "miscadmin.h" #include "utils/rel.h" +static void ginFindParents(GinBtree btree, GinBtreeStack *stack); +static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats); +static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, + bool freestack, GinStatsData *buildStats); + /* - * Locks buffer by needed method for search. + * Lock buffer by needed method for search. */ static int ginTraverseLock(Buffer buffer, bool searchMode) @@ -52,58 +59,51 @@ ginTraverseLock(Buffer buffer, bool searchMode) return access; } -GinBtreeStack * -ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno) -{ - GinBtreeStack *stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); - - stack->blkno = blkno; - stack->buffer = ReadBuffer(btree->index, stack->blkno); - stack->parent = NULL; - stack->predictNumber = 1; - - ginTraverseLock(stack->buffer, btree->searchMode); - - return stack; -} - /* - * Locates leaf page contained tuple + * Descend the tree to the leaf page that contains or would contain the key + * we're searching for. The key should already be filled in 'btree', in + * tree-type specific manner. If btree->fullScan is true, descends to the + * leftmost leaf page. + * + * If 'searchmode' is false, on return stack->buffer is exclusively locked, + * and the stack represents the full path to the root. Otherwise stack->buffer + * is share-locked, and stack->parent is NULL. */ GinBtreeStack * -ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) +ginFindLeafPage(GinBtree btree, bool searchMode) { - bool isfirst = TRUE; - BlockNumber rootBlkno; + GinBtreeStack *stack; - if (!stack) - stack = ginPrepareFindLeafPage(btree, GIN_ROOT_BLKNO); - rootBlkno = stack->blkno; + stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + stack->blkno = btree->rootBlkno; + stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); + stack->parent = NULL; + stack->predictNumber = 1; for (;;) { Page page; BlockNumber child; - int access = GIN_SHARE; + int access; stack->off = InvalidOffsetNumber; page = BufferGetPage(stack->buffer); - if (isfirst) - { - if (GinPageIsLeaf(page) && !btree->searchMode) - access = GIN_EXCLUSIVE; - isfirst = FALSE; - } - else - access = ginTraverseLock(stack->buffer, btree->searchMode); + access = ginTraverseLock(stack->buffer, searchMode); + + /* + * If we're going to modify the tree, finish any incomplete splits we + * encounter on the way. + */ + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); /* * ok, page is correctly locked, we should check to move right .., * root never has a right link, so small optimization */ - while (btree->fullScan == FALSE && stack->blkno != rootBlkno && + while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno && btree->isMoveRight(btree, page)) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; @@ -112,11 +112,12 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) /* rightmost page */ break; + stack->buffer = ginStepRight(stack->buffer, btree->index, access); stack->blkno = rightlink; - LockBuffer(stack->buffer, GIN_UNLOCK); - stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); - LockBuffer(stack->buffer, access); page = BufferGetPage(stack->buffer); + + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); } if (GinPageIsLeaf(page)) /* we found, return locked page */ @@ -129,7 +130,7 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) Assert(child != InvalidBlockNumber); Assert(stack->blkno != child); - if (btree->searchMode) + if (searchMode) { /* in search mode we may forget path to leaf */ stack->blkno = child; @@ -146,9 +147,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) stack->predictNumber = 1; } } +} - /* keep compiler happy */ - return NULL; +/* + * Step right from current page. + * + * The next page is locked first, before releasing the current page. This is + * crucial to protect from concurrent page deletion (see comment in + * ginDeletePage). + */ +Buffer +ginStepRight(Buffer buffer, Relation index, int lockmode) +{ + Buffer nextbuffer; + Page page = BufferGetPage(buffer); + bool isLeaf = GinPageIsLeaf(page); + bool isData = GinPageIsData(page); + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + nextbuffer = ReadBuffer(index, blkno); + LockBuffer(nextbuffer, lockmode); + UnlockReleaseBuffer(buffer); + + /* Sanity check that the page we stepped to is of similar kind. */ + page = BufferGetPage(nextbuffer); + if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page)) + elog(ERROR, "right sibling of GIN page is of different type"); + + /* + * Given the proper lock sequence above, we should never land on a deleted + * page. + */ + if (GinPageIsDeleted(page)) + elog(ERROR, "right sibling of GIN page was deleted"); + + return nextbuffer; } void @@ -167,172 +200,293 @@ freeGinBtreeStack(GinBtreeStack *stack) } /* - * Try to find parent for current stack position, returns correct - * parent and child's offset in stack->parent. - * Function should never release root page to prevent conflicts - * with vacuum process + * Try to find parent for current stack position. Returns correct parent and + * child's offset in stack->parent. The root page is never released, to + * to prevent conflict with vacuum process. */ -void -ginFindParents(GinBtree btree, GinBtreeStack *stack, - BlockNumber rootBlkno) +static void +ginFindParents(GinBtree btree, GinBtreeStack *stack) { - Page page; Buffer buffer; BlockNumber blkno, leftmostBlkno; OffsetNumber offset; - GinBtreeStack *root = stack->parent; + GinBtreeStack *root; GinBtreeStack *ptr; - if (!root) + /* + * Unwind the stack all the way up to the root, leaving only the root + * item. + * + * Be careful not to release the pin on the root page! The pin on root + * page is required to lock out concurrent vacuums on the tree. + */ + root = stack->parent; + while (root->parent) { - /* XLog mode... */ - root = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); - root->blkno = rootBlkno; - root->buffer = ReadBuffer(btree->index, rootBlkno); - LockBuffer(root->buffer, GIN_EXCLUSIVE); - root->parent = NULL; + ReleaseBuffer(root->buffer); + root = root->parent; } - else - { - /* - * find root, we should not release root page until update is - * finished!! - */ - while (root->parent) - { - ReleaseBuffer(root->buffer); - root = root->parent; - } - Assert(root->blkno == rootBlkno); - Assert(BufferGetBlockNumber(root->buffer) == rootBlkno); - LockBuffer(root->buffer, GIN_EXCLUSIVE); - } + Assert(root->blkno == btree->rootBlkno); + Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno); root->off = InvalidOffsetNumber; - page = BufferGetPage(root->buffer); - Assert(!GinPageIsLeaf(page)); - - /* check trivial case */ - if ((root->off = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) != InvalidOffsetNumber) - { - stack->parent = root; - return; - } + blkno = root->blkno; + buffer = root->buffer; + offset = InvalidOffsetNumber; - leftmostBlkno = blkno = btree->getLeftMostPage(btree, page); - LockBuffer(root->buffer, GIN_UNLOCK); - Assert(blkno != InvalidBlockNumber); + ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); for (;;) { - buffer = ReadBuffer(btree->index, blkno); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); if (GinPageIsLeaf(page)) elog(ERROR, "Lost path"); - leftmostBlkno = btree->getLeftMostPage(btree, page); + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + + /* + * parent may be wrong, but if so, the ginFinishSplit call will + * recurse to call ginFindParents again to fix it. + */ + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } + + leftmostBlkno = btree->getLeftMostChild(btree, page); while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) { blkno = GinPageGetOpaque(page)->rightlink; - LockBuffer(buffer, GIN_UNLOCK); - ReleaseBuffer(buffer); if (blkno == InvalidBlockNumber) + { + UnlockReleaseBuffer(buffer); break; - buffer = ReadBuffer(btree->index, blkno); - LockBuffer(buffer, GIN_EXCLUSIVE); + } + buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE); page = BufferGetPage(buffer); + + /* finish any incomplete splits, as above */ + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } } if (blkno != InvalidBlockNumber) { - ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); ptr->blkno = blkno; ptr->buffer = buffer; - ptr->parent = root; /* it's may be wrong, but in next call we will + ptr->parent = root; /* it may be wrong, but in next call we will * correct */ ptr->off = offset; stack->parent = ptr; return; } + /* Descend down to next level */ blkno = leftmostBlkno; + buffer = ReadBuffer(btree->index, blkno); } } /* - * Insert value (stored in GinBtree) to tree described by stack + * Insert a new item to a page. * - * During an index build, buildStats is non-null and the counters - * it contains should be incremented as needed. + * Returns true if the insertion was finished. On false, the page was split and + * the parent needs to be updated. (a root split returns true as it doesn't + * need any further action by the caller to complete) * - * NB: the passed-in stack is freed, as though by freeGinBtreeStack. + * When inserting a downlink to a internal page, 'childbuf' contains the + * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared + * atomically with the insert. Also, the existing item at the given location + * is updated to point to 'updateblkno'. + * + * stack->buffer is locked on entry, and is kept locked. */ -void -ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) +static bool +ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats) { - GinBtreeStack *parent = stack; - BlockNumber rootBlkno = InvalidBuffer; - Page page, - rpage, - lpage; - - /* remember root BlockNumber */ - while (parent) + Page page = BufferGetPage(stack->buffer); + XLogRecData *payloadrdata; + GinPlaceToPageRC rc; + uint16 xlflags = 0; + Page childpage = NULL; + Page newlpage = NULL, + newrpage = NULL; + + if (GinPageIsData(page)) + xlflags |= GIN_INSERT_ISDATA; + if (GinPageIsLeaf(page)) + { + xlflags |= GIN_INSERT_ISLEAF; + Assert(!BufferIsValid(childbuf)); + Assert(updateblkno == InvalidBlockNumber); + } + else { - rootBlkno = parent->blkno; - parent = parent->parent; + Assert(BufferIsValid(childbuf)); + Assert(updateblkno != InvalidBlockNumber); + childpage = BufferGetPage(childbuf); } - while (stack) + /* + * Try to put the incoming tuple on the page. placeToPage will decide if + * the page needs to be split. + */ + rc = btree->placeToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &payloadrdata, &newlpage, &newrpage); + if (rc == UNMODIFIED) + return true; + else if (rc == INSERTED) { - XLogRecData *rdata; - BlockNumber savedRightLink; + /* placeToPage did START_CRIT_SECTION() */ + MarkBufferDirty(stack->buffer); - page = BufferGetPage(stack->buffer); - savedRightLink = GinPageGetOpaque(page)->rightlink; + /* An insert to an internal page finishes the split of the child. */ + if (childbuf != InvalidBuffer) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } - if (btree->isEnoughSpace(btree, stack->buffer, stack->off)) + if (RelationNeedsWAL(btree->index)) { - START_CRIT_SECTION(); - btree->placeToPage(btree, stack->buffer, stack->off, &rdata); + XLogRecPtr recptr; + XLogRecData rdata[3]; + ginxlogInsert xlrec; + BlockIdData childblknos[2]; - MarkBufferDirty(stack->buffer); + xlrec.node = btree->index->rd_node; + xlrec.blkno = BufferGetBlockNumber(stack->buffer); + xlrec.flags = xlflags; - if (RelationNeedsWAL(btree->index)) + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = sizeof(ginxlogInsert); + + /* + * Log information about child if this was an insertion of a + * downlink. + */ + if (childbuf != InvalidBuffer) { - XLogRecPtr recptr; + rdata[0].next = &rdata[1]; - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); + BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); + BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); + + rdata[1].buffer = InvalidBuffer; + rdata[1].data = (char *) childblknos; + rdata[1].len = sizeof(BlockIdData) * 2; + rdata[1].next = &rdata[2]; + + rdata[2].buffer = childbuf; + rdata[2].buffer_std = false; + rdata[2].data = NULL; + rdata[2].len = 0; + rdata[2].next = payloadrdata; } + else + rdata[0].next = payloadrdata; - LockBuffer(stack->buffer, GIN_UNLOCK); - END_CRIT_SECTION(); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); + PageSetLSN(page, recptr); + if (childbuf != InvalidBuffer) + PageSetLSN(childpage, recptr); + } - freeGinBtreeStack(stack); + END_CRIT_SECTION(); - return; + return true; + } + else if (rc == SPLIT) + { + /* Didn't fit, have to split */ + Buffer rbuffer; + BlockNumber savedRightLink; + XLogRecData rdata[2]; + ginxlogSplit data; + Buffer lbuffer = InvalidBuffer; + Page newrootpg = NULL; + + rbuffer = GinNewBuffer(btree->index); + + /* During index build, count the new page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + + savedRightLink = GinPageGetOpaque(page)->rightlink; + + /* + * newlpage and newrpage are pointers to memory pages, not associated + * with buffers. stack->buffer is not touched yet. + */ + + data.node = btree->index->rd_node; + data.rblkno = BufferGetBlockNumber(rbuffer); + data.flags = xlflags; + if (childbuf != InvalidBuffer) + { + Page childpage = BufferGetPage(childbuf); + + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + + data.leftChildBlkno = BufferGetBlockNumber(childbuf); + data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; } else + data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogSplit); + + if (childbuf != InvalidBuffer) { - Buffer rbuffer = GinNewBuffer(btree->index); - Page newlpage; + rdata[0].next = &rdata[1]; + + rdata[1].buffer = childbuf; + rdata[1].buffer_std = false; + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].next = payloadrdata; + } + else + rdata[0].next = payloadrdata; + if (stack->parent == NULL) + { /* - * newlpage is a pointer to memory page, it doesn't associate with - * buffer, stack->buffer should be untouched + * split root, so we need to allocate new left page and place + * pointer on root to left and right page */ - newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata); - - ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno; + lbuffer = GinNewBuffer(btree->index); - /* During index build, count the newly-split page */ + /* During index build, count the newly-added root page */ if (buildStats) { if (btree->isData) @@ -341,134 +495,259 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) buildStats->nEntryPages++; } - parent = stack->parent; + /* + * root never has a right-link, so we borrow the rrlink field to + * store the root block number. + */ + data.rrlink = BufferGetBlockNumber(stack->buffer); + data.lblkno = BufferGetBlockNumber(lbuffer); + data.flags |= GIN_SPLIT_ROOT; - if (parent == NULL) - { - /* - * split root, so we need to allocate new left page and place - * pointer on root to left and right page - */ - Buffer lbuffer = GinNewBuffer(btree->index); - - ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE; - ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber; - - page = BufferGetPage(stack->buffer); - lpage = BufferGetPage(lbuffer); - rpage = BufferGetPage(rbuffer); - - GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber; - GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); - ((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer); - - START_CRIT_SECTION(); - - GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF); - PageRestoreTempPage(newlpage, lpage); - btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer); - - MarkBufferDirty(rbuffer); - MarkBufferDirty(lbuffer); - MarkBufferDirty(stack->buffer); - - if (RelationNeedsWAL(btree->index)) - { - XLogRecPtr recptr; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); - PageSetLSN(lpage, recptr); - PageSetTLI(lpage, ThisTimeLineID); - PageSetLSN(rpage, recptr); - PageSetTLI(rpage, ThisTimeLineID); - } - - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); - LockBuffer(stack->buffer, GIN_UNLOCK); - END_CRIT_SECTION(); - - freeGinBtreeStack(stack); - - /* During index build, count the newly-added root page */ - if (buildStats) - { - if (btree->isData) - buildStats->nDataPages++; - else - buildStats->nEntryPages++; - } - - return; - } - else - { - /* split non-root page */ - ((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE; - ((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink; - - lpage = BufferGetPage(stack->buffer); - rpage = BufferGetPage(rbuffer); - - GinPageGetOpaque(rpage)->rightlink = savedRightLink; - GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); - - START_CRIT_SECTION(); - PageRestoreTempPage(newlpage, lpage); - - MarkBufferDirty(rbuffer); - MarkBufferDirty(stack->buffer); - - if (RelationNeedsWAL(btree->index)) - { - XLogRecPtr recptr; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); - PageSetLSN(lpage, recptr); - PageSetTLI(lpage, ThisTimeLineID); - PageSetLSN(rpage, recptr); - PageSetTLI(rpage, ThisTimeLineID); - } - UnlockReleaseBuffer(rbuffer); - END_CRIT_SECTION(); - } + GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + + /* + * Construct a new root page containing downlinks to the new left + * and right pages. (do this in a temporary copy first rather than + * overwriting the original page directly, so that we can still + * abort gracefully if this fails.) + */ + newrootpg = PageGetTempPage(newrpage); + GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); + + btree->fillRoot(btree, newrootpg, + BufferGetBlockNumber(lbuffer), newlpage, + BufferGetBlockNumber(rbuffer), newrpage); + } + else + { + /* split non-root page */ + data.rrlink = savedRightLink; + data.lblkno = BufferGetBlockNumber(stack->buffer); + + GinPageGetOpaque(newrpage)->rightlink = savedRightLink; + GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + } + + /* + * Ok, we have the new contents of the left page in a temporary copy + * now (newlpage), and the newly-allocated right block has been filled + * in. The original page is still unchanged. + * + * If this is a root split, we also have a temporary page containing + * the new contents of the root. Copy the new left page to a + * newly-allocated block, and initialize the (original) root page the + * new copy. Otherwise, copy over the temporary copy of the new left + * page over the old left page. + */ + + START_CRIT_SECTION(); + + MarkBufferDirty(rbuffer); + MarkBufferDirty(stack->buffer); + if (BufferIsValid(childbuf)) + MarkBufferDirty(childbuf); + + /* + * Restore the temporary copies over the real buffers. But don't free + * the temporary copies yet, WAL record data points to them. + */ + if (stack->parent == NULL) + { + MarkBufferDirty(lbuffer); + memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ); + memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } + else + { + memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + + /* write WAL record */ + if (RelationNeedsWAL(btree->index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); + PageSetLSN(BufferGetPage(stack->buffer), recptr); + PageSetLSN(BufferGetPage(rbuffer), recptr); + if (stack->parent == NULL) + PageSetLSN(BufferGetPage(lbuffer), recptr); + if (BufferIsValid(childbuf)) + PageSetLSN(childpage, recptr); + } + END_CRIT_SECTION(); + + /* + * We can release the lock on the right page now, but keep the + * original buffer locked. + */ + UnlockReleaseBuffer(rbuffer); + if (stack->parent == NULL) + UnlockReleaseBuffer(lbuffer); + + pfree(newlpage); + pfree(newrpage); + if (newrootpg) + pfree(newrootpg); + + /* + * If we split the root, we're done. Otherwise the split is not + * complete until the downlink for the new page has been inserted to + * the parent. + */ + if (stack->parent == NULL) + return true; + else + return false; + } + else + { + elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc); + return false; /* keep compiler quiet */ + } +} - btree->isDelete = FALSE; +/* + * Finish a split by inserting the downlink for the new page to parent. + * + * On entry, stack->buffer is exclusively locked. + * + * If freestack is true, all the buffers are released and unlocked as we + * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept + * locked, and stack is unmodified, except for possibly moving right to find + * the correct parent of page. + */ +static void +ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, + GinStatsData *buildStats) +{ + Page page; + bool done; + bool first = true; + + /* + * freestack == false when we encounter an incompletely split page during + * a scan, while freestack == true is used in the normal scenario that a + * split is finished right after the initial insert. + */ + if (!freestack) + elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", + stack->blkno, RelationGetRelationName(btree->index)); + + /* this loop crawls up the stack until the insertion is complete */ + do + { + GinBtreeStack *parent = stack->parent; + void *insertdata; + BlockNumber updateblkno; /* search parent to lock */ LockBuffer(parent->buffer, GIN_EXCLUSIVE); + /* + * If the parent page was incompletely split, finish that split first, + * then continue with the current one. + * + * Note: we have to finish *all* incomplete splits we encounter, even + * if we have to move right. Otherwise we might choose as the target a + * page that has no downlink in the parent, and splitting it further + * would fail. + */ + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); + /* move right if it's needed */ page = BufferGetPage(parent->buffer); while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) { - BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; - - LockBuffer(parent->buffer, GIN_UNLOCK); - - if (rightlink == InvalidBlockNumber) + if (GinPageRightMost(page)) { /* * rightmost page, but we don't find parent, we should use * plain search... */ - ginFindParents(btree, stack, rootBlkno); + LockBuffer(parent->buffer, GIN_UNLOCK); + ginFindParents(btree, stack); parent = stack->parent; - page = BufferGetPage(parent->buffer); + Assert(parent != NULL); break; } - parent->blkno = rightlink; - parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno); - LockBuffer(parent->buffer, GIN_EXCLUSIVE); + parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); + parent->blkno = BufferGetBlockNumber(parent->buffer); page = BufferGetPage(parent->buffer); + + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); } - UnlockReleaseBuffer(stack->buffer); - pfree(stack); + /* insert the downlink */ + insertdata = btree->prepareDownlink(btree, stack->buffer); + updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink; + done = ginPlaceToPage(btree, parent, + insertdata, updateblkno, + stack->buffer, buildStats); + pfree(insertdata); + + /* + * If the caller requested to free the stack, unlock and release the + * child buffer now. Otherwise keep it pinned and locked, but if we + * have to recurse up the tree, we can unlock the upper pages, only + * keeping the page at the bottom of the stack locked. + */ + if (!first || freestack) + LockBuffer(stack->buffer, GIN_UNLOCK); + if (freestack) + { + ReleaseBuffer(stack->buffer); + pfree(stack); + } stack = parent; + + first = false; + } while (!done); + + /* unlock the parent */ + LockBuffer(stack->buffer, GIN_UNLOCK); + + if (freestack) + freeGinBtreeStack(stack); +} + +/* + * Insert a value to tree described by stack. + * + * The value to be inserted is given in 'insertdata'. Its format depends + * on whether this is an entry or data tree, ginInsertValue just passes it + * through to the tree-specific callback function. + * + * During an index build, buildStats is non-null and the counters it contains + * are incremented as needed. + * + * NB: the passed-in stack is freed, as though by freeGinBtreeStack. + */ +void +ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, + GinStatsData *buildStats) +{ + bool done; + + /* If the leaf page was incompletely split, finish the split first */ + if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) + ginFinishSplit(btree, stack, false, buildStats); + + done = ginPlaceToPage(btree, stack, + insertdata, InvalidBlockNumber, + InvalidBuffer, buildStats); + if (done) + { + LockBuffer(stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); } + else + ginFinishSplit(btree, stack, true, buildStats); } diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index ddae862720..3af027187a 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -4,7 +4,7 @@ * routines for fast build of inverted index * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -187,7 +187,7 @@ ginInsertBAEntry(BuildAccumulator *accum, * Since the entries are being inserted into a balanced binary tree, you * might think that the order of insertion wouldn't be critical, but it turns * out that inserting the entries in sorted order results in a lot of - * rebalancing operations and is slow. To prevent this, we attempt to insert + * rebalancing operations and is slow. To prevent this, we attempt to insert * the nodes in an order that will produce a nearly-balanced tree if the input * is in fact sorted. * diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 13601468ed..272a9ca7c0 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * * gindatapage.c - * page utilities routines for the postgres inverted index access method. + * routines for handling GIN posting tree pages. * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,69 +15,214 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/heapam_xlog.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "utils/memutils.h" #include "utils/rel.h" -int -ginCompareItemPointers(ItemPointer a, ItemPointer b) +/* + * Min, Max and Target size of posting lists stored on leaf pages, in bytes. + * + * The code can deal with any size, but random access is more efficient when + * a number of smaller lists are stored, rather than one big list. If a + * posting list would become larger than Max size as a result of insertions, + * it is split into two. If a posting list would be smaller than minimum + * size, it is merged with the next posting list. + */ +#define GinPostingListSegmentMaxSize 384 +#define GinPostingListSegmentTargetSize 256 +#define GinPostingListSegmentMinSize 128 + +/* + * At least this many items fit in a GinPostingListSegmentMaxSize-bytes + * long segment. This is used when estimating how much space is required + * for N items, at minimum. + */ +#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6) + +/* + * A working struct for manipulating a posting tree leaf page. + */ +typedef struct { - BlockNumber ba = GinItemPointerGetBlockNumber(a); - BlockNumber bb = GinItemPointerGetBlockNumber(b); + dlist_head segments; /* a list of leafSegmentInfos */ - if (ba == bb) - { - OffsetNumber oa = GinItemPointerGetOffsetNumber(a); - OffsetNumber ob = GinItemPointerGetOffsetNumber(b); + /* + * The following fields represent how the segments are split across pages, + * if a page split is required. Filled in by leafRepackItems. + */ + dlist_node *lastleft; /* last segment on left page */ + int lsize; /* total size on left page */ + int rsize; /* total size on right page */ - if (oa == ob) - return 0; - return (oa > ob) ? 1 : -1; - } + bool oldformat; /* page is in pre-9.4 format on disk */ +} disassembledLeaf; - return (ba > bb) ? 1 : -1; -} +typedef struct +{ + dlist_node node; /* linked list pointers */ + + /*------------- + * 'action' indicates the status of this in-memory segment, compared to + * what's on disk. It is one of the GIN_SEGMENT_* action codes: + * + * UNMODIFIED no changes + * DELETE the segment is to be removed. 'seg' and 'items' are + * ignored + * INSERT this is a completely new segment + * REPLACE this replaces an existing segment with new content + * ADDITEMS like REPLACE, but no items have been removed, and we track + * in detail what items have been added to this segment, in + * 'modifieditems' + *------------- + */ + char action; + + ItemPointerData *modifieditems; + int nmodifieditems; + + /* + * The following fields represent the items in this segment. If 'items' is + * not NULL, it contains a palloc'd array of the itemsin this segment. If + * 'seg' is not NULL, it contains the items in an already-compressed + * format. It can point to an on-disk page (!modified), or a palloc'd + * segment in memory. If both are set, they must represent the same items. + */ + GinPostingList *seg; + ItemPointer items; + int nitems; /* # of items in 'items', if items != NULL */ +} leafSegmentInfo; + +static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); +static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage); + +static disassembledLeaf *disassembleLeaf(Page page); +static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); +static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, + int nNewItems); + +static XLogRecData *constructLeafRecompressWALData(Buffer buf, + disassembledLeaf *leaf); +static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); +static void dataPlaceToPageLeafSplit(Buffer buf, + disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + XLogRecData **prdata, Page lpage, Page rpage); /* - * Merge two ordered arrays of itempointers, eliminating any duplicates. - * Returns the number of items in the result. - * Caller is responsible that there is enough space at *dst. + * Read TIDs from leaf data page to single uncompressed array. The TIDs are + * returned in ascending order. + * + * advancePast is a hint, indicating that the caller is only interested in + * TIDs > advancePast. To return all items, use ItemPointerSetMin. + * + * Note: This function can still return items smaller than advancePast that + * are in the same posting list as the items of interest, so the caller must + * still check all the returned items. But passing it allows this function to + * skip whole posting lists. */ -uint32 -ginMergeItemPointers(ItemPointerData *dst, - ItemPointerData *a, uint32 na, - ItemPointerData *b, uint32 nb) +ItemPointer +GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast) { - ItemPointerData *dptr = dst; - ItemPointerData *aptr = a, - *bptr = b; + ItemPointer result; - while (aptr - a < na && bptr - b < nb) + if (GinPageIsCompressed(page)) { - int cmp = ginCompareItemPointers(aptr, bptr); + GinPostingList *seg = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + Pointer endptr = ((Pointer) seg) + len; + GinPostingList *next; - if (cmp > 0) - *dptr++ = *bptr++; - else if (cmp == 0) + /* Skip to the segment containing advancePast+1 */ + if (ItemPointerIsValid(&advancePast)) { - /* we want only one copy of the identical items */ - *dptr++ = *bptr++; - aptr++; + next = GinNextPostingListSegment(seg); + while ((Pointer) next < endptr && + ginCompareItemPointers(&next->first, &advancePast) <= 0) + { + seg = next; + next = GinNextPostingListSegment(seg); + } + len = endptr - (Pointer) seg; } + + if (len > 0) + result = ginPostingListDecodeAllSegments(seg, len, nitems); else - *dptr++ = *aptr++; + { + result = NULL; + *nitems = 0; + } } + else + { + ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems); - while (aptr - a < na) - *dptr++ = *aptr++; + result = palloc((*nitems) * sizeof(ItemPointerData)); + memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData)); + } - while (bptr - b < nb) - *dptr++ = *bptr++; + return result; +} - return dptr - dst; +/* + * Places all TIDs from leaf data page to bitmap. + */ +int +GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) +{ + ItemPointer uncompressed; + int nitems; + + if (GinPageIsCompressed(page)) + { + GinPostingList *segment = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + + nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); + } + else + { + uncompressed = dataLeafPageGetUncompressed(page, &nitems); + + if (nitems > 0) + tbm_add_tuples(tbm, uncompressed, nitems, false); + } + + return nitems; } /* - * Checks, should we move to right link... - * Compares inserting itemp pointer with right bound of current page + * Get pointer to the uncompressed array of items on a pre-9.4 format + * uncompressed leaf page. The number of items in the array is returned in + * *nitems. + */ +static ItemPointer +dataLeafPageGetUncompressed(Page page, int *nitems) +{ + ItemPointer items; + + Assert(!GinPageIsCompressed(page)); + + /* + * In the old pre-9.4 page format, the whole page content is used for + * uncompressed items, and the number of items is stored in 'maxoff' + */ + items = (ItemPointer) GinDataPageGetData(page); + *nitems = GinPageGetOpaque(page)->maxoff; + + return items; +} + +/* + * Check if we should follow the right link to find the item we're searching + * for. + * + * Compares inserting item pointer with the right bound of the current page. */ static bool dataIsMoveRight(GinBtree btree, Page page) @@ -87,12 +232,12 @@ dataIsMoveRight(GinBtree btree, Page page) if (GinPageRightMost(page)) return FALSE; - return (ginCompareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE; + return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? TRUE : FALSE; } /* - * Find correct PostingItem in non-leaf page. It supposed that page - * correctly chosen and searching value SHOULD be on page + * Find correct PostingItem in non-leaf page. It is assumed that this is + * the correct page, and the searched value SHOULD be on the page. */ static BlockNumber dataLocateItem(GinBtree btree, GinBtreeStack *stack) @@ -111,7 +256,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) { stack->off = FirstOffsetNumber; stack->predictNumber *= GinPageGetOpaque(page)->maxoff; - return btree->getLeftMostPage(btree, page); + return btree->getLeftMostChild(btree, page); } low = FirstOffsetNumber; @@ -124,7 +269,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) { OffsetNumber mid = low + ((high - low) / 2); - pitem = (PostingItem *) GinDataPageGetItem(page, mid); + pitem = GinDataPageGetPostingItem(page, mid); if (mid == maxoff) { @@ -136,8 +281,8 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) } else { - pitem = (PostingItem *) GinDataPageGetItem(page, mid); - result = ginCompareItemPointers(btree->items + btree->curitem, &(pitem->key)); + pitem = GinDataPageGetPostingItem(page, mid); + result = ginCompareItemPointers(&btree->itemptr, &(pitem->key)); } if (result == 0) @@ -154,67 +299,12 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) Assert(high >= FirstOffsetNumber && high <= maxoff); stack->off = high; - pitem = (PostingItem *) GinDataPageGetItem(page, high); + pitem = GinDataPageGetPostingItem(page, high); return PostingItemGetBlockNumber(pitem); } /* - * Searches correct position for value on leaf page. - * Page should be correctly chosen. - * Returns true if value found on page. - */ -static bool -dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack) -{ - Page page = BufferGetPage(stack->buffer); - OffsetNumber low, - high; - int result; - - Assert(GinPageIsLeaf(page)); - Assert(GinPageIsData(page)); - - if (btree->fullScan) - { - stack->off = FirstOffsetNumber; - return TRUE; - } - - low = FirstOffsetNumber; - high = GinPageGetOpaque(page)->maxoff; - - if (high < low) - { - stack->off = FirstOffsetNumber; - return false; - } - - high++; - - while (high > low) - { - OffsetNumber mid = low + ((high - low) / 2); - - result = ginCompareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid)); - - if (result == 0) - { - stack->off = mid; - return true; - } - else if (result > 0) - low = mid + 1; - else - high = mid; - } - - stack->off = high; - return false; -} - -/* - * Finds links to blkno on non-leaf page, returns - * offset of PostingItem + * Find link to blkno on non-leaf page, returns offset of PostingItem */ static OffsetNumber dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) @@ -229,7 +319,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor /* if page isn't changed, we return storedOff */ if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) { - pitem = (PostingItem *) GinDataPageGetItem(page, storedOff); + pitem = GinDataPageGetPostingItem(page, storedOff); if (PostingItemGetBlockNumber(pitem) == blkno) return storedOff; @@ -239,7 +329,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor */ for (i = storedOff + 1; i <= maxoff; i++) { - pitem = (PostingItem *) GinDataPageGetItem(page, i); + pitem = GinDataPageGetPostingItem(page, i); if (PostingItemGetBlockNumber(pitem) == blkno) return i; } @@ -250,7 +340,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor /* last chance */ for (i = FirstOffsetNumber; i <= maxoff; i++) { - pitem = (PostingItem *) GinDataPageGetItem(page, i); + pitem = GinDataPageGetPostingItem(page, i); if (PostingItemGetBlockNumber(pitem) == blkno) return i; } @@ -259,7 +349,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor } /* - * returns blkno of leftmost child + * Return blkno of leftmost child */ static BlockNumber dataGetLeftMostPage(GinBtree btree, Page page) @@ -270,39 +360,49 @@ dataGetLeftMostPage(GinBtree btree, Page page) Assert(GinPageIsData(page)); Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber); - pitem = (PostingItem *) GinDataPageGetItem(page, FirstOffsetNumber); + pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber); return PostingItemGetBlockNumber(pitem); } /* - * add ItemPointer or PostingItem to page. data should point to - * correct value! depending on leaf or non-leaf page + * Add PostingItem to a non-leaf page. */ void -GinDataPageAddItem(Page page, void *data, OffsetNumber offset) +GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; char *ptr; + Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber); + Assert(!GinPageIsLeaf(page)); + if (offset == InvalidOffsetNumber) { - ptr = GinDataPageGetItem(page, maxoff + 1); + ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1); } else { - ptr = GinDataPageGetItem(page, offset); - if (maxoff + 1 - offset != 0) - memmove(ptr + GinSizeOfDataPageItem(page), + ptr = (char *) GinDataPageGetPostingItem(page, offset); + if (offset != maxoff + 1) + memmove(ptr + sizeof(PostingItem), ptr, - (maxoff - offset + 1) * GinSizeOfDataPageItem(page)); + (maxoff - offset + 1) *sizeof(PostingItem)); } - memcpy(ptr, data, GinSizeOfDataPageItem(page)); + memcpy(ptr, data, sizeof(PostingItem)); + + maxoff++; + GinPageGetOpaque(page)->maxoff = maxoff; - GinPageGetOpaque(page)->maxoff++; + /* + * Also set pd_lower to the end of the posting items, to follow the + * "standard" page layout, so that we can squeeze out the unused space + * from full-page images. + */ + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); } /* - * Deletes posting item from non-leaf page + * Delete posting item from non-leaf page */ void GinPageDeletePostingItem(Page page, OffsetNumber offset) @@ -313,271 +413,892 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) Assert(offset >= FirstOffsetNumber && offset <= maxoff); if (offset != maxoff) - memmove(GinDataPageGetItem(page, offset), GinDataPageGetItem(page, offset + 1), + memmove(GinDataPageGetPostingItem(page, offset), + GinDataPageGetPostingItem(page, offset + 1), sizeof(PostingItem) * (maxoff - offset)); - GinPageGetOpaque(page)->maxoff--; + maxoff--; + GinPageGetOpaque(page)->maxoff = maxoff; + + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); } /* - * checks space to install new value, - * item pointer never deletes! + * Places keys to leaf data page and fills WAL record. */ -static bool -dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off) +static GinPlaceToPageRC +dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, XLogRecData **prdata, + Page *newlpage, Page *newrpage) { + GinBtreeDataLeafInsertData *items = insertdata; + ItemPointer newItems = &items->items[items->curitem]; + int maxitems = items->nitem - items->curitem; Page page = BufferGetPage(buf); + int i; + ItemPointerData rbound; + ItemPointerData lbound; + bool needsplit; + bool append; + int segsize; + Size freespace; + MemoryContext tmpCxt; + MemoryContext oldCxt; + disassembledLeaf *leaf; + leafSegmentInfo *lastleftinfo; + ItemPointerData maxOldItem; + ItemPointerData remaining; Assert(GinPageIsData(page)); - Assert(!btree->isDelete); - if (GinPageIsLeaf(page)) + rbound = *GinDataPageGetRightBound(page); + + /* + * Count how many of the new items belong to this page. + */ + if (!GinPageRightMost(page)) { - if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff) + for (i = 0; i < maxitems; i++) { - if ((btree->nitem - btree->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page)) - return true; + if (ginCompareItemPointers(&newItems[i], &rbound) > 0) + { + /* + * This needs to go to some other location in the tree. (The + * caller should've chosen the insert location so that at + * least the first item goes here.) + */ + Assert(i > 0); + break; + } } - else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page)) - return true; + maxitems = i; } - else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page)) - return true; - return false; -} + /* + * The following operations do quite a lot of small memory allocations, + * create a temporary memory context so that we don't need to keep track + * of them individually. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "Gin split temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldCxt = MemoryContextSwitchTo(tmpCxt); -/* - * In case of previous split update old child blkno to - * new right page - * item pointer never deletes! - */ -static BlockNumber -dataPrepareData(GinBtree btree, Page page, OffsetNumber off) -{ - BlockNumber ret = InvalidBlockNumber; + leaf = disassembleLeaf(page); - Assert(GinPageIsData(page)); + /* + * Are we appending to the end of the page? IOW, are all the new items + * larger than any of the existing items. + */ + if (!dlist_is_empty(&leaf->segments)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, + dlist_tail_node(&leaf->segments)); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1]; + if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0) + append = true; + else + append = false; + } + else + { + ItemPointerSetMin(&maxOldItem); + append = true; + } - if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber) + /* + * If we're appending to the end of the page, we will append as many items + * as we can fit (after splitting), and stop when the pages becomes full. + * Otherwise we have to limit the number of new items to insert, because + * once we start packing we can't just stop when we run out of space, + * because we must make sure that all the old items still fit. + */ + if (GinPageIsCompressed(page)) + freespace = GinDataLeafPageGetFreeSpace(page); + else + freespace = 0; + if (append) { - PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, off); + /* + * Even when appending, trying to append more items than will fit is + * not completely free, because we will merge the new items and old + * items into an array below. In the best case, every new item fits in + * a single byte, and we can use all the free space on the old page as + * well as the new page. For simplicity, ignore segment overhead etc. + */ + maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize); + } + else + { + /* + * Calculate a conservative estimate of how many new items we can fit + * on the two pages after splitting. + * + * We can use any remaining free space on the old page to store full + * segments, as well as the new page. Each full-sized segment can hold + * at least MinTuplesPerSegment items + */ + int nnewsegments; + + nnewsegments = freespace / GinPostingListSegmentMaxSize; + nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize; + maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); + } + + /* Add the new items to the segments */ + if (!addItemsToLeaf(leaf, newItems, maxitems)) + { + /* all items were duplicates, we have nothing to do */ + items->curitem += maxitems; + + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + return UNMODIFIED; + } + + /* + * Pack the items back to compressed segments, ready for writing to disk. + */ + needsplit = leafRepackItems(leaf, &remaining); + + /* + * Did all the new items fit? + * + * If we're appending, it's OK if they didn't. But as a sanity check, + * verify that all the old items fit. + */ + if (ItemPointerIsValid(&remaining)) + { + if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0) + elog(ERROR, "could not split GIN page; all old items didn't fit"); + + /* Count how many of the new items did fit. */ + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &remaining) >= 0) + break; + } + if (i == 0) + elog(ERROR, "could not split GIN page; no new items fit"); + maxitems = i; + } + + if (!needsplit) + { + /* + * Great, all the items fit on a single page. Construct a WAL record + * describing the changes we made, and write the segments back to the + * page. + * + * Once we start modifying the page, there's no turning back. The + * caller is responsible for calling END_CRIT_SECTION() after writing + * the WAL record. + */ + MemoryContextSwitchTo(oldCxt); + if (RelationNeedsWAL(btree->index)) + *prdata = constructLeafRecompressWALData(buf, leaf); + else + *prdata = NULL; + START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buf, leaf); + + if (append) + elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + } + else + { + /* + * Had to split. + * + * We already divided the segments between the left and the right + * page. The left page was filled as full as possible, and the rest + * overflowed to the right page. When building a new index, that's + * good, because the table is scanned from beginning to end and there + * won't be any more insertions to the left page during the build. + * This packs the index as tight as possible. But otherwise, split + * 50/50, by moving segments from the left page to the right page + * until they're balanced. + * + * As a further heuristic, when appending items to the end of the + * page, split 75/25, one the assumption that subsequent insertions + * will probably also go to the end. This packs the index somewhat + * tighter when appending to a table, which is very common. + */ + if (!btree->isBuild) + { + while (dlist_has_prev(&leaf->segments, leaf->lastleft)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + + segsize = SizeOfGinPostingList(lastleftinfo->seg); + if (append) + { + if ((leaf->lsize - segsize) - (leaf->lsize - segsize) < BLCKSZ / 4) + break; + } + else + { + if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0) + break; + } + + leaf->lsize -= segsize; + leaf->rsize += segsize; + leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); + } + } + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); - PostingItemSetBlockNumber(pitem, btree->rightblkno); - ret = btree->rightblkno; + /* + * Fetch the max item in the left page's last segment; it becomes the + * right bound of the page. + */ + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; + + *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ); + *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); + + dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, + prdata, *newlpage, *newrpage); + + Assert(GinPageRightMost(page) || + ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), + GinDataPageGetRightBound(*newrpage)) < 0); + + if (append) + elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); } - btree->rightblkno = InvalidBlockNumber; + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + items->curitem += maxitems; - return ret; + return needsplit ? SPLIT : INSERTED; } /* - * Places keys to page and fills WAL record. In case leaf page and - * build mode puts all ItemPointers to page. + * Vacuum a posting tree leaf page. */ -static void -dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) +void +ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) { - Page page = BufferGetPage(buf); - int sizeofitem = GinSizeOfDataPageItem(page); - int cnt = 0; + Page page = BufferGetPage(buffer); + disassembledLeaf *leaf; + bool removedsomething = false; + dlist_iter iter; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata[3]; - static ginxlogInsert data; + leaf = disassembleLeaf(page); - *prdata = rdata; - Assert(GinPageIsData(page)); - - data.updateBlkno = dataPrepareData(btree, page, off); + /* Vacuum each segment. */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + int oldsegsize; + ItemPointer cleaned; + int ncleaned; + + if (!seginfo->items) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (seginfo->seg) + oldsegsize = SizeOfGinPostingList(seginfo->seg); + else + oldsegsize = GinDataPageMaxDataSize; + + cleaned = ginVacuumItemPointers(gvs, + seginfo->items, + seginfo->nitems, + &ncleaned); + pfree(seginfo->items); + seginfo->items = NULL; + seginfo->nitems = 0; + if (cleaned) + { + if (ncleaned > 0) + { + int npacked; + + seginfo->seg = ginCompressPostingList(cleaned, + ncleaned, + oldsegsize, + &npacked); + /* Removing an item never increases the size of the segment */ + if (npacked != ncleaned) + elog(ERROR, "could not fit vacuumed posting list"); + seginfo->action = GIN_SEGMENT_REPLACE; + } + else + { + seginfo->seg = NULL; + seginfo->items = NULL; + seginfo->action = GIN_SEGMENT_DELETE; + } + seginfo->nitems = ncleaned; - data.node = btree->index->rd_node; - data.blkno = BufferGetBlockNumber(buf); - data.offset = off; - data.nitem = 1; - data.isDelete = FALSE; - data.isData = TRUE; - data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE; + removedsomething = true; + } + } /* - * Prevent full page write if child's split occurs. That is needed to - * remove incomplete splits while replaying WAL + * If we removed any items, reconstruct the page from the pieces. * - * data.updateBlkno contains new block number (of newly created right - * page) for recently splited page. + * We don't try to re-encode the segments here, even though some of them + * might be really small now that we've removed some items from them. It + * seems like a waste of effort, as there isn't really any benefit from + * larger segments per se; larger segments only help to pack more items in + * the same space. We might as well delay doing that until the next + * insertion, which will need to re-encode at least part of the page + * anyway. + * + * Also note if the page was in uncompressed, pre-9.4 format before, it is + * now represented as one huge segment that contains all the items. It + * might make sense to split that, to speed up random access, but we don't + * bother. You'll have to REINDEX anyway if you want the full gain of the + * new tighter index format. */ - if (data.updateBlkno == InvalidBlockNumber) + if (removedsomething) { - rdata[0].buffer = buf; - rdata[0].buffer_std = FALSE; - rdata[0].data = NULL; - rdata[0].len = 0; - rdata[0].next = &rdata[1]; - cnt++; + XLogRecData *payloadrdata = NULL; + bool modified; + + /* + * Make sure we have a palloc'd copy of all segments, after the first + * segment that is modified. (dataPlaceToPageLeafRecompress requires + * this). + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + if (modified && seginfo->action != GIN_SEGMENT_DELETE) + { + int segsize = SizeOfGinPostingList(seginfo->seg); + GinPostingList *tmp = (GinPostingList *) palloc(segsize); + + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + if (RelationNeedsWAL(indexrel)) + payloadrdata = constructLeafRecompressWALData(buffer, leaf); + START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buffer, leaf); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(indexrel)) + { + XLogRecPtr recptr; + XLogRecData rdata; + ginxlogVacuumDataLeafPage xlrec; + + xlrec.node = indexrel->rd_node; + xlrec.blkno = BufferGetBlockNumber(buffer); + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) &xlrec; + rdata.len = offsetof(ginxlogVacuumDataLeafPage, data); + rdata.next = payloadrdata; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); } +} - rdata[cnt].buffer = InvalidBuffer; - rdata[cnt].data = (char *) &data; - rdata[cnt].len = sizeof(ginxlogInsert); - rdata[cnt].next = &rdata[cnt + 1]; - cnt++; +/* + * Construct a ginxlogRecompressDataLeaf record representing the changes + * in *leaf. + */ +static XLogRecData * +constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +{ + int nmodified = 0; + char *walbufbegin; + char *walbufend; + XLogRecData *rdata; + dlist_iter iter; + int segno; + ginxlogRecompressDataLeaf *recompress_xlog; + + /* Count the modified segments */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); - rdata[cnt].buffer = InvalidBuffer; - rdata[cnt].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem)); - rdata[cnt].len = sizeofitem; - rdata[cnt].next = NULL; + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + nmodified++; + } - if (GinPageIsLeaf(page)) + walbufbegin = palloc( + sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment + * data */ + nmodified * 2 + /* (segno + action) per action */ + sizeof(XLogRecData)); + walbufend = walbufbegin; + + recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; + walbufend += sizeof(ginxlogRecompressDataLeaf); + + recompress_xlog->nactions = nmodified; + + segno = 0; + dlist_foreach(iter, &leaf->segments) { - if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff) + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + int segsize = 0; + int datalen; + uint8 action = seginfo->action; + + if (action == GIN_SEGMENT_UNMODIFIED) { - /* usually, create index... */ - uint32 savedPos = btree->curitem; + segno++; + continue; + } - while (btree->curitem < btree->nitem) - { - GinDataPageAddItem(page, btree->items + btree->curitem, off); - off++; - btree->curitem++; - } - data.nitem = btree->curitem - savedPos; - rdata[cnt].len = sizeofitem * data.nitem; + if (action != GIN_SEGMENT_DELETE) + segsize = SizeOfGinPostingList(seginfo->seg); + + /* + * If storing the uncompressed list of added item pointers would take + * more space than storing the compressed segment as is, do that + * instead. + */ + if (action == GIN_SEGMENT_ADDITEMS && + seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize) + { + action = GIN_SEGMENT_REPLACE; } - else + + *((uint8 *) (walbufend++)) = segno; + *(walbufend++) = action; + + switch (action) { - GinDataPageAddItem(page, btree->items + btree->curitem, off); - btree->curitem++; + case GIN_SEGMENT_DELETE: + datalen = 0; + break; + + case GIN_SEGMENT_ADDITEMS: + datalen = seginfo->nmodifieditems * sizeof(ItemPointerData); + memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16)); + memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen); + datalen += sizeof(uint16); + break; + + case GIN_SEGMENT_INSERT: + case GIN_SEGMENT_REPLACE: + datalen = SHORTALIGN(segsize); + memcpy(walbufend, seginfo->seg, segsize); + break; + + default: + elog(ERROR, "unexpected GIN leaf action %d", action); } + walbufend += datalen; + + if (action != GIN_SEGMENT_INSERT) + segno++; } - else - GinDataPageAddItem(page, &(btree->pitem), off); + + rdata = (XLogRecData *) MAXALIGN(walbufend); + rdata->buffer = buf; + rdata->buffer_std = TRUE; + rdata->data = walbufbegin; + rdata->len = walbufend - walbufbegin; + rdata->next = NULL; + + return rdata; } /* - * split page and fills WAL record. original buffer(lbuf) leaves untouched, - * returns shadow page of lbuf filled new data. In leaf page and build mode puts all - * ItemPointers to pages. Also, in build mode splits data by way to full fulled - * left page + * Assemble a disassembled posting tree leaf page back to a buffer. + * + * *prdata is filled with WAL information about this operation. The caller + * is responsible for inserting to the WAL, along with any other information + * about the operation that triggered this recompression. + * + * NOTE: The segment pointers must not point directly to the same buffer, + * except for segments that have not been modified and whose preceding + * segments have not been modified either. */ -static Page -dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata) +static void +dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) { + Page page = BufferGetPage(buf); char *ptr; - OffsetNumber separator; - ItemPointer bound; - Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); - ItemPointerData oldbound = *GinDataPageGetRightBound(lpage); - int sizeofitem = GinSizeOfDataPageItem(lpage); - OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff; - Page rpage = BufferGetPage(rbuf); - Size pageSize = PageGetPageSize(lpage); - Size freeSpace; - uint32 nCopied = 1; + int newsize; + bool modified = false; + dlist_iter iter; + int segsize; - /* these must be static so they can be returned to caller */ - static ginxlogSplit data; - static XLogRecData rdata[4]; - static char vector[2 * BLCKSZ]; + /* + * If the page was in pre-9.4 format before, convert the header, and force + * all segments to be copied to the page whether they were modified or + * not. + */ + if (!GinPageIsCompressed(page)) + { + Assert(leaf->oldformat); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + modified = true; + } + + ptr = (char *) GinDataLeafPageGetPostingList(page); + newsize = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); - GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); - freeSpace = GinDataPageGetFreeSpace(rpage); + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; - *prdata = rdata; - data.leftChildBlkno = (GinPageIsLeaf(lpage)) ? - InvalidOffsetNumber : PostingItemGetBlockNumber(&(btree->pitem)); - data.updateBlkno = dataPrepareData(btree, lpage, off); + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + + if (modified) + memcpy(ptr, seginfo->seg, segsize); - memcpy(vector, GinDataPageGetItem(lpage, FirstOffsetNumber), - maxoff * sizeofitem); + ptr += segsize; + newsize += segsize; + } + } + + Assert(newsize <= GinDataPageMaxDataSize); + GinDataPageSetDataSize(page, newsize); +} - if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff) +/* + * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf + * segments to two pages instead of one. + * + * This is different from the non-split cases in that this does not modify + * the original page directly, but to temporary in-memory copies of the new + * left and right pages. + */ +static void +dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + XLogRecData **prdata, Page lpage, Page rpage) +{ + char *ptr; + int segsize; + int lsize; + int rsize; + dlist_node *node; + dlist_node *firstright; + leafSegmentInfo *seginfo; + + /* these must be static so they can be returned to caller */ + static ginxlogSplitDataLeaf split_xlog; + static XLogRecData rdata[3]; + + /* Initialize temporary pages to hold the new left and right pages */ + GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + + /* + * Copy the segments that go to the left page. + * + * XXX: We should skip copying the unmodified part of the left page, like + * we do when recompressing. + */ + lsize = 0; + ptr = (char *) GinDataLeafPageGetPostingList(lpage); + firstright = dlist_next_node(&leaf->segments, leaf->lastleft); + for (node = dlist_head_node(&leaf->segments); + node != firstright; + node = dlist_next_node(&leaf->segments, node)) { - nCopied = 0; - while (btree->curitem < btree->nitem && - maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData))) + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) { - memcpy(vector + maxoff * sizeof(ItemPointerData), - btree->items + btree->curitem, - sizeof(ItemPointerData)); - maxoff++; - nCopied++; - btree->curitem++; + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + lsize += segsize; } } - else + Assert(lsize == leaf->lsize); + GinDataPageSetDataSize(lpage, lsize); + *GinDataPageGetRightBound(lpage) = lbound; + + /* Copy the segments that go to the right page */ + ptr = (char *) GinDataLeafPageGetPostingList(rpage); + rsize = 0; + for (node = firstright; + ; + node = dlist_next_node(&leaf->segments, node)) { - ptr = vector + (off - 1) * sizeofitem; - if (maxoff + 1 - off != 0) - memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem); - if (GinPageIsLeaf(lpage)) + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) { - memcpy(ptr, btree->items + btree->curitem, sizeofitem); - btree->curitem++; + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + rsize += segsize; } - else - memcpy(ptr, &(btree->pitem), sizeofitem); - maxoff++; + if (!dlist_has_next(&leaf->segments, node)) + break; + } + Assert(rsize == leaf->rsize); + GinDataPageSetDataSize(rpage, rsize); + *GinDataPageGetRightBound(rpage) = rbound; + + /* Create WAL record */ + split_xlog.lsize = lsize; + split_xlog.rsize = rsize; + split_xlog.lrightbound = lbound; + split_xlog.rrightbound = rbound; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &split_xlog; + rdata[0].len = sizeof(ginxlogSplitDataLeaf); + rdata[0].next = &rdata[1]; + + rdata[1].buffer = InvalidBuffer; + rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage); + rdata[1].len = lsize; + rdata[1].next = &rdata[2]; + + rdata[2].buffer = InvalidBuffer; + rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage); + rdata[2].len = rsize; + rdata[2].next = NULL; + + *prdata = rdata; +} + +/* + * Place a PostingItem to page, and fill a WAL record. + * + * If the item doesn't fit, returns false without modifying the page. + * + * In addition to inserting the given item, the downlink of the existing item + * at 'off' is updated to point to 'updateblkno'. + */ +static GinPlaceToPageRC +dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; + + /* these must be static so they can be returned to caller */ + static XLogRecData rdata; + static ginxlogInsertDataInternal data; + + /* split if we have to */ + if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) + { + dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, + prdata, newlpage, newrpage); + return SPLIT; } + *prdata = &rdata; + Assert(GinPageIsData(page)); + + START_CRIT_SECTION(); + + /* Update existing downlink to point to next page (on internal page) */ + pitem = GinDataPageGetPostingItem(page, off); + PostingItemSetBlockNumber(pitem, updateblkno); + + /* Add new item */ + pitem = (PostingItem *) insertdata; + GinDataPageAddPostingItem(page, pitem, off); + + data.offset = off; + data.newitem = *pitem; + + rdata.buffer = buf; + rdata.buffer_std = TRUE; + rdata.data = (char *) &data; + rdata.len = sizeof(ginxlogInsertDataInternal); + rdata.next = NULL; + + return INSERTED; +} + +/* + * Places an item (or items) to a posting tree. Calls relevant function of + * internal of leaf page because they are handled very differently. + */ +static GinPlaceToPageRC +dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + return dataPlaceToPageLeaf(btree, buf, stack, insertdata, + prdata, newlpage, newrpage); + else + return dataPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + prdata, newlpage, newrpage); +} + +/* + * Split page and fill WAL record. Returns a new temp buffer filled with data + * that should go to the left page. The original buffer is left untouched. + */ +static void +dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage) +{ + Page oldpage = BufferGetPage(origbuf); + OffsetNumber off = stack->off; + int nitems = GinPageGetOpaque(oldpage)->maxoff; + int nleftitems; + int nrightitems; + Size pageSize = PageGetPageSize(oldpage); + ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage); + ItemPointer bound; + Page lpage; + Page rpage; + OffsetNumber separator; + + /* these must be static so they can be returned to caller */ + static ginxlogSplitDataInternal data; + static XLogRecData rdata[4]; + static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; + + lpage = PageGetTempPage(oldpage); + rpage = PageGetTempPage(oldpage); + GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); + GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); + + *prdata = rdata; + /* - * we suppose that during index creation table scaned from begin to end, - * so ItemPointers are monotonically increased.. + * First construct a new list of PostingItems, which includes all the old + * items, and the new item. */ - if (btree->isBuild && GinPageRightMost(lpage)) - separator = freeSpace / sizeofitem; - else - separator = maxoff / 2; + memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber), + (off - 1) * sizeof(PostingItem)); - GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); - GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); + allitems[off - 1] = *((PostingItem *) insertdata); + memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off), + (nitems - (off - 1)) * sizeof(PostingItem)); + nitems++; - memcpy(GinDataPageGetItem(lpage, FirstOffsetNumber), vector, separator * sizeofitem); - GinPageGetOpaque(lpage)->maxoff = separator; - memcpy(GinDataPageGetItem(rpage, FirstOffsetNumber), - vector + separator * sizeofitem, (maxoff - separator) * sizeofitem); - GinPageGetOpaque(rpage)->maxoff = maxoff - separator; + /* Update existing downlink to point to next page */ + PostingItemSetBlockNumber(&allitems[off], updateblkno); - PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf)); - if (GinPageIsLeaf(lpage)) - btree->pitem.key = *(ItemPointerData *) GinDataPageGetItem(lpage, - GinPageGetOpaque(lpage)->maxoff); + /* + * When creating a new index, fit as many tuples as possible on the left + * page, on the assumption that the table is scanned from beginning to + * end. This packs the index as tight as possible. + */ + if (btree->isBuild && GinPageRightMost(oldpage)) + separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem); else - btree->pitem.key = ((PostingItem *) GinDataPageGetItem(lpage, - GinPageGetOpaque(lpage)->maxoff))->key; - btree->rightblkno = BufferGetBlockNumber(rbuf); + separator = nitems / 2; + nleftitems = separator; + nrightitems = nitems - separator; + + memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), + allitems, + nleftitems * sizeof(PostingItem)); + GinPageGetOpaque(lpage)->maxoff = nleftitems; + memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), + &allitems[separator], + nrightitems * sizeof(PostingItem)); + GinPageGetOpaque(rpage)->maxoff = nrightitems; + + /* + * Also set pd_lower for both pages, like GinDataPageAddPostingItem does. + */ + GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem)); + GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem)); /* set up right bound for left page */ bound = GinDataPageGetRightBound(lpage); - *bound = btree->pitem.key; + *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key; /* set up right bound for right page */ - bound = GinDataPageGetRightBound(rpage); - *bound = oldbound; + *GinDataPageGetRightBound(rpage) = oldbound; - data.node = btree->index->rd_node; - data.rootBlkno = InvalidBlockNumber; - data.lblkno = BufferGetBlockNumber(lbuf); - data.rblkno = BufferGetBlockNumber(rbuf); data.separator = separator; - data.nitem = maxoff; - data.isData = TRUE; - data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE; - data.isRootSplit = FALSE; + data.nitem = nitems; data.rightbound = oldbound; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplit); + rdata[0].len = sizeof(ginxlogSplitDataInternal); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; - rdata[1].data = vector; - rdata[1].len = MAXALIGN(maxoff * sizeofitem); + rdata[1].data = (char *) allitems; + rdata[1].len = nitems * sizeof(PostingItem); rdata[1].next = NULL; - return lpage; + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +dataPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + PostingItem *pitem = palloc(sizeof(PostingItem)); + Page lpage = BufferGetPage(lbuf); + + PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf)); + pitem->key = *GinDataPageGetRightBound(lpage); + + return pitem; } /* @@ -585,102 +1306,596 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe * Also called from ginxlog, should not use btree */ void -ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) +ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage) { - Page page = BufferGetPage(root), - lpage = BufferGetPage(lbuf), - rpage = BufferGetPage(rbuf); PostingItem li, ri; li.key = *GinDataPageGetRightBound(lpage); - PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf)); - GinDataPageAddItem(page, &li, InvalidOffsetNumber); + PostingItemSetBlockNumber(&li, lblkno); + GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber); ri.key = *GinDataPageGetRightBound(rpage); - PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf)); - GinDataPageAddItem(page, &ri, InvalidOffsetNumber); + PostingItemSetBlockNumber(&ri, rblkno); + GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber); +} + + +/*** Functions to work with disassembled leaf pages ***/ + +/* + * Disassemble page into a disassembledLeaf struct. + */ +static disassembledLeaf * +disassembleLeaf(Page page) +{ + disassembledLeaf *leaf; + GinPostingList *seg; + Pointer segbegin; + Pointer segend; + + leaf = palloc0(sizeof(disassembledLeaf)); + dlist_init(&leaf->segments); + + if (GinPageIsCompressed(page)) + { + /* + * Create a leafSegment entry for each segment. + */ + seg = GinDataLeafPageGetPostingList(page); + segbegin = (Pointer) seg; + segend = segbegin + GinDataLeafPageGetPostingListSize(page); + while ((Pointer) seg < segend) + { + leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_UNMODIFIED; + seginfo->seg = seg; + seginfo->items = NULL; + seginfo->nitems = 0; + dlist_push_tail(&leaf->segments, &seginfo->node); + + seg = GinNextPostingListSegment(seg); + } + leaf->oldformat = false; + } + else + { + /* + * A pre-9.4 format uncompressed page is represented by a single + * segment, with an array of items. + */ + ItemPointer uncompressed; + int nuncompressed; + leafSegmentInfo *seginfo; + + uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed); + + seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_REPLACE; + seginfo->seg = NULL; + seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); + memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); + seginfo->nitems = nuncompressed; + + dlist_push_tail(&leaf->segments, &seginfo->node); + + leaf->oldformat = true; + } + + return leaf; +} + +/* + * Distribute newItems to the segments. + * + * Any segments that acquire new items are decoded, and the new items are + * merged with the old items. + * + * Returns true if any new items were added. False means they were all + * duplicates of existing items on the page. + */ +static bool +addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) +{ + dlist_iter iter; + ItemPointer nextnew = newItems; + int newleft = nNewItems; + bool modified = false; + leafSegmentInfo *newseg; + + /* + * If the page is completely empty, just construct one new segment to hold + * all the new items. + */ + if (dlist_is_empty(&leaf->segments)) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = newItems; + newseg->nitems = nNewItems; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + return true; + } + + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur); + int nthis; + ItemPointer tmpitems; + int ntmpitems; + + /* + * How many of the new items fall into this segment? + */ + if (!dlist_has_next(&leaf->segments, iter.cur)) + nthis = newleft; + else + { + leafSegmentInfo *next; + ItemPointerData next_first; + + next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, + dlist_next_node(&leaf->segments, iter.cur)); + if (next->items) + next_first = next->items[0]; + else + { + Assert(next->seg != NULL); + next_first = next->seg->first; + } + + nthis = 0; + while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0) + nthis++; + } + if (nthis == 0) + continue; + + /* Merge the new items with the existing items. */ + if (!cur->items) + cur->items = ginPostingListDecode(cur->seg, &cur->nitems); + + /* + * Fast path for the important special case that we're appending to + * the end of the page: don't let the last segment on the page grow + * larger than the target, create a new segment before that happens. + */ + if (!dlist_has_next(&leaf->segments, iter.cur) && + ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 && + cur->seg != NULL && + SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = nextnew; + newseg->nitems = nthis; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + modified = true; + break; + } + + tmpitems = ginMergeItemPointers(cur->items, cur->nitems, + nextnew, nthis, + &ntmpitems); + if (ntmpitems != cur->nitems) + { + /* + * If there are no duplicates, track the added items so that we + * can emit a compact ADDITEMS WAL record later on. (it doesn't + * seem worth re-checking which items were duplicates, if there + * were any) + */ + if (ntmpitems == nthis + cur->nitems && + cur->action == GIN_SEGMENT_UNMODIFIED) + { + cur->action = GIN_SEGMENT_ADDITEMS; + cur->modifieditems = nextnew; + cur->nmodifieditems = nthis; + } + else + cur->action = GIN_SEGMENT_REPLACE; + + cur->items = tmpitems; + cur->nitems = ntmpitems; + cur->seg = NULL; + modified = true; + } + + nextnew += nthis; + newleft -= nthis; + if (newleft == 0) + break; + } + + return modified; +} + +/* + * Recompresses all segments that have been modified. + * + * If not all the items fit on two pages (ie. after split), we store as + * many items as fit, and set *remaining to the first item that didn't fit. + * If all items fit, *remaining is set to invalid. + * + * Returns true if the page has to be split. + */ +static bool +leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) +{ + int pgused = 0; + bool needsplit = false; + dlist_iter iter; + int segsize; + leafSegmentInfo *nextseg; + int npacked; + bool modified; + dlist_node *cur_node; + dlist_node *next_node; + + ItemPointerSetInvalid(remaining); + + /* + * cannot use dlist_foreach_modify here because we insert adjacent items + * while iterating. + */ + for (cur_node = dlist_head_node(&leaf->segments); + cur_node != NULL; + cur_node = next_node) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + cur_node); + + if (dlist_has_next(&leaf->segments, cur_node)) + next_node = dlist_next_node(&leaf->segments, cur_node); + else + next_node = NULL; + + /* Compress the posting list, if necessary */ + if (seginfo->action != GIN_SEGMENT_DELETE) + { + if (seginfo->seg == NULL) + { + if (seginfo->nitems > GinPostingListSegmentMaxSize) + npacked = 0; /* no chance that it would fit. */ + else + { + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentMaxSize, + &npacked); + } + if (npacked != seginfo->nitems) + { + /* + * Too large. Compress again to the target size, and + * create a new segment to represent the remaining items. + * The new segment is inserted after this one, so it will + * be processed in the next iteration of this loop. + */ + if (seginfo->seg) + pfree(seginfo->seg); + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentTargetSize, + &npacked); + if (seginfo->action != GIN_SEGMENT_INSERT) + seginfo->action = GIN_SEGMENT_REPLACE; + + nextseg = palloc(sizeof(leafSegmentInfo)); + nextseg->action = GIN_SEGMENT_INSERT; + nextseg->seg = NULL; + nextseg->items = &seginfo->items[npacked]; + nextseg->nitems = seginfo->nitems - npacked; + next_node = &nextseg->node; + dlist_insert_after(cur_node, next_node); + } + } + + /* + * If the segment is very small, merge it with the next segment. + */ + if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node) + { + int nmerged; + + nextseg = dlist_container(leafSegmentInfo, node, next_node); + + if (seginfo->items == NULL) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (nextseg->items == NULL) + nextseg->items = ginPostingListDecode(nextseg->seg, + &nextseg->nitems); + nextseg->items = + ginMergeItemPointers(seginfo->items, seginfo->nitems, + nextseg->items, nextseg->nitems, + &nmerged); + Assert(nmerged == seginfo->nitems + nextseg->nitems); + nextseg->nitems = nmerged; + nextseg->seg = NULL; + + nextseg->action = GIN_SEGMENT_REPLACE; + nextseg->modifieditems = NULL; + nextseg->nmodifieditems = 0; + + if (seginfo->action == GIN_SEGMENT_INSERT) + { + dlist_delete(cur_node); + continue; + } + else + { + seginfo->action = GIN_SEGMENT_DELETE; + seginfo->seg = NULL; + } + } + + seginfo->items = NULL; + seginfo->nitems = 0; + } + + if (seginfo->action == GIN_SEGMENT_DELETE) + continue; + + /* + * OK, we now have a compressed version of this segment ready for + * copying to the page. Did we exceed the size that fits on one page? + */ + segsize = SizeOfGinPostingList(seginfo->seg); + if (pgused + segsize > GinDataPageMaxDataSize) + { + if (!needsplit) + { + /* switch to right page */ + Assert(pgused > 0); + leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node); + needsplit = true; + leaf->lsize = pgused; + pgused = 0; + } + else + { + /* + * Filled both pages. The last segment we constructed did not + * fit. + */ + *remaining = seginfo->seg->first; + + /* + * remove all segments that did not fit from the list. + */ + while (dlist_has_next(&leaf->segments, cur_node)) + dlist_delete(dlist_next_node(&leaf->segments, cur_node)); + dlist_delete(cur_node); + break; + } + } + + pgused += segsize; + } + + if (!needsplit) + { + leaf->lsize = pgused; + leaf->rsize = 0; + } + else + leaf->rsize = pgused; + + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); + + /* + * Make a palloc'd copy of every segment after the first modified one, + * because as we start copying items to the original page, we might + * overwrite an existing segment. + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED) + { + modified = true; + } + else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED) + { + GinPostingList *tmp; + + segsize = SizeOfGinPostingList(seginfo->seg); + tmp = palloc(segsize); + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + return needsplit; +} + + +/*** Functions that are exported to the rest of the GIN code ***/ + +/* + * Creates new posting tree containing the given TIDs. Returns the page + * number of the root of the new posting tree. + * + * items[] must be in sorted order with no duplicates. + */ +BlockNumber +createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, + GinStatsData *buildStats) +{ + BlockNumber blkno; + Buffer buffer; + Page tmppage; + Page page; + Pointer ptr; + int nrootitems; + int rootsize; + + /* Construct the new root page in memory first. */ + tmppage = (Page) palloc(BLCKSZ); + GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber; + + /* + * Write as many of the items to the root page as fit. In segments of max + * GinPostingListSegmentMaxSize bytes each. + */ + nrootitems = 0; + rootsize = 0; + ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage); + while (nrootitems < nitems) + { + GinPostingList *segment; + int npacked; + int segsize; + + segment = ginCompressPostingList(&items[nrootitems], + nitems - nrootitems, + GinPostingListSegmentMaxSize, + &npacked); + segsize = SizeOfGinPostingList(segment); + if (rootsize + segsize > GinDataPageMaxDataSize) + break; + + memcpy(ptr, segment, segsize); + ptr += segsize; + rootsize += segsize; + nrootitems += npacked; + pfree(segment); + } + GinDataPageSetDataSize(tmppage, rootsize); + + /* + * All set. Get a new physical page, and copy the in-memory page to it. + */ + buffer = GinNewBuffer(index); + page = BufferGetPage(buffer); + blkno = BufferGetBlockNumber(buffer); + + START_CRIT_SECTION(); + + PageRestoreTempPage(tmppage, page); + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + XLogRecData rdata[2]; + ginxlogCreatePostingTree data; + + data.node = index->rd_node; + data.blkno = blkno; + data.size = rootsize; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogCreatePostingTree); + rdata[0].next = &rdata[1]; + + rdata[1].buffer = InvalidBuffer; + rdata[1].data = (char *) GinDataLeafPageGetPostingList(page); + rdata[1].len = rootsize; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); + PageSetLSN(page, recptr); + } + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* During index build, count the newly-added data page */ + if (buildStats) + buildStats->nDataPages++; + + elog(DEBUG2, "created GIN posting tree with %d items", nrootitems); + + /* + * Add any remaining TIDs to the newly-created posting tree. + */ + if (nitems > nrootitems) + { + ginInsertItemPointers(index, blkno, + items + nrootitems, + nitems - nrootitems, + buildStats); + } + + return blkno; } void -ginPrepareDataScan(GinBtree btree, Relation index) +ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) { memset(btree, 0, sizeof(GinBtreeData)); btree->index = index; + btree->rootBlkno = rootBlkno; btree->findChildPage = dataLocateItem; + btree->getLeftMostChild = dataGetLeftMostPage; btree->isMoveRight = dataIsMoveRight; - btree->findItem = dataLocateLeafItem; + btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; - btree->getLeftMostPage = dataGetLeftMostPage; - btree->isEnoughSpace = dataIsEnoughSpace; btree->placeToPage = dataPlaceToPage; - btree->splitPage = dataSplitPage; btree->fillRoot = ginDataFillRoot; + btree->prepareDownlink = dataPrepareDownlink; btree->isData = TRUE; - btree->searchMode = FALSE; - btree->isDelete = FALSE; btree->fullScan = FALSE; btree->isBuild = FALSE; } -GinPostingTreeScan * -ginPrepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode) -{ - GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan)); - - ginPrepareDataScan(&gdi->btree, index); - - gdi->btree.searchMode = searchMode; - gdi->btree.fullScan = searchMode; - - gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno); - - return gdi; -} - /* * Inserts array of item pointers, may execute several tree scan (very rare) */ void -ginInsertItemPointers(GinPostingTreeScan *gdi, +ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats) { - BlockNumber rootBlkno = gdi->stack->blkno; + GinBtreeData btree; + GinBtreeDataLeafInsertData insertdata; + GinBtreeStack *stack; - gdi->btree.items = items; - gdi->btree.nitem = nitem; - gdi->btree.curitem = 0; + ginPrepareDataScan(&btree, index, rootBlkno); + btree.isBuild = (buildStats != NULL); + insertdata.items = items; + insertdata.nitem = nitem; + insertdata.curitem = 0; - while (gdi->btree.curitem < gdi->btree.nitem) + while (insertdata.curitem < insertdata.nitem) { - if (!gdi->stack) - gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno); - - gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack); - - if (gdi->btree.findItem(&(gdi->btree), gdi->stack)) - { - /* - * gdi->btree.items[gdi->btree.curitem] already exists in index - */ - gdi->btree.curitem++; - LockBuffer(gdi->stack->buffer, GIN_UNLOCK); - freeGinBtreeStack(gdi->stack); - } - else - ginInsertValue(&(gdi->btree), gdi->stack, buildStats); + /* search for the leaf page where the first item should go to */ + btree.itemptr = insertdata.items[insertdata.curitem]; + stack = ginFindLeafPage(&btree, false); - gdi->stack = NULL; + ginInsertValue(&btree, stack, &insertdata, buildStats); } } -Buffer -ginScanBeginPostingTree(GinPostingTreeScan *gdi) +/* + * Starts a new scan on a posting tree. + */ +GinBtreeStack * +ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno) { - gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack); - return gdi->stack->buffer; + GinBtreeStack *stack; + + ginPrepareDataScan(btree, index, rootBlkno); + + btree->fullScan = TRUE; + + stack = ginFindLeafPage(btree, TRUE); + + return stack; } diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 70fcddfe40..412f90da4d 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * * ginentrypage.c - * page utilities routines for the postgres inverted index access method. + * routines for handling GIN entry tree pages. * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,8 +15,15 @@ #include "postgres.h" #include "access/gin_private.h" +#include "miscadmin.h" #include "utils/rel.h" +static void entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertPayload, + BlockNumber updateblkno, XLogRecData **prdata, + Page *newlpage, Page *newrpage); + /* * Form a tuple for entry tree. * @@ -27,15 +34,15 @@ * format that is being built here. We build on the assumption that we * are making a leaf-level key entry containing a posting list of nipd items. * If the caller is actually trying to make a posting-tree entry, non-leaf - * entry, or pending-list entry, it should pass nipd = 0 and then overwrite - * the t_tid fields as necessary. In any case, ipd can be NULL to skip - * copying any itempointers into the posting list; the caller is responsible - * for filling the posting list afterwards, if ipd = NULL and nipd > 0. + * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite + * the t_tid fields as necessary. In any case, 'data' can be NULL to skip + * filling in the posting list; the caller is responsible for filling it + * afterwards if data = NULL and nipd > 0. */ IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, - ItemPointerData *ipd, uint32 nipd, + Pointer data, Size dataSize, int nipd, bool errorTooBig) { Datum datums[2]; @@ -83,24 +90,23 @@ GinFormTuple(GinState *ginstate, newsize = SHORTALIGN(newsize); GinSetPostingOffset(itup, newsize); - GinSetNPosting(itup, nipd); /* * Add space needed for posting list, if any. Then check that the tuple * won't be too big to store. */ - newsize += sizeof(ItemPointerData) * nipd; + newsize += dataSize; + newsize = MAXALIGN(newsize); - if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) + + if (newsize > GinMaxItemSize) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", - (unsigned long) newsize, - (unsigned long) Min(INDEX_SIZE_MASK, - GinMaxItemSize), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + (Size) newsize, (Size) GinMaxItemSize, RelationGetRelationName(ginstate->index)))); pfree(itup); return NULL; @@ -113,12 +119,28 @@ GinFormTuple(GinState *ginstate, { itup = repalloc(itup, newsize); + /* + * PostgreSQL 9.3 and earlier did not clear this new space, so we + * might find uninitialized padding when reading tuples from disk. + */ + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; } /* + * Copy in the posting list, if provided + */ + if (data) + { + char *ptr = GinGetPosting(itup); + + memcpy(ptr, data, dataSize); + } + + /* * Insert category byte, if needed */ if (category != GIN_CAT_NORM_KEY) @@ -126,44 +148,52 @@ GinFormTuple(GinState *ginstate, Assert(IndexTupleHasNulls(itup)); GinSetNullCategory(itup, ginstate, category); } - - /* - * Copy in the posting list, if provided - */ - if (ipd) - memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); - return itup; } /* - * Sometimes we reduce the number of posting list items in a tuple after - * having built it with GinFormTuple. This function adjusts the size - * fields to match. + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. */ -void -GinShortenTuple(IndexTuple itup, uint32 nipd) +ItemPointer +ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, + int *nitems) { - uint32 newsize; - - Assert(nipd <= GinGetNPosting(itup)); - - newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd; - newsize = MAXALIGN(newsize); + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; - Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK)); - - itup->t_info &= ~INDEX_SIZE_MASK; - itup->t_info |= newsize; - - GinSetNPosting(itup, nipd); + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; } /* * Form a non-leaf entry tuple by copying the key data from the given tuple, * which can be either a leaf or non-leaf entry tuple. * - * Any posting list in the source tuple is not copied. The specified child + * Any posting list in the source tuple is not copied. The specified child * block number is inserted into t_tid. */ static IndexTuple @@ -252,7 +282,7 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack) { stack->off = FirstOffsetNumber; stack->predictNumber *= PageGetMaxOffsetNumber(page); - return btree->getLeftMostPage(btree, page); + return btree->getLeftMostChild(btree, page); } low = FirstOffsetNumber; @@ -425,22 +455,26 @@ entryGetLeftMostPage(GinBtree btree, Page page) } static bool -entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off) +entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, + GinBtreeEntryInsertData *insertData) { - Size itupsz = 0; + Size releasedsz = 0; + Size addedsz; Page page = BufferGetPage(buf); - Assert(btree->entry); + Assert(insertData->entry); Assert(!GinPageIsData(page)); - if (btree->isDelete) + if (insertData->isDelete) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); - itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } - if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData)) + addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData); + + if (PageGetFreeSpace(page) + releasedsz >= addedsz) return true; return false; @@ -451,136 +485,140 @@ entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off) * should update it, update old child blkno to new right page * if child split occurred */ -static BlockNumber -entryPreparePage(GinBtree btree, Page page, OffsetNumber off) +static void +entryPreparePage(GinBtree btree, Page page, OffsetNumber off, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno) { - BlockNumber ret = InvalidBlockNumber; - - Assert(btree->entry); + Assert(insertData->entry); Assert(!GinPageIsData(page)); - if (btree->isDelete) + if (insertData->isDelete) { Assert(GinPageIsLeaf(page)); PageIndexTupleDelete(page, off); } - if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber) + if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); - GinSetDownlink(itup, btree->rightblkno); - ret = btree->rightblkno; + GinSetDownlink(itup, updateblkno); } - - btree->rightblkno = InvalidBlockNumber; - - return ret; } /* * Place tuple on page and fills WAL record + * + * If the tuple doesn't fit, returns false without modifying the page. + * + * On insertion to an internal node, in addition to inserting the given item, + * the downlink of the existing item at 'off' is updated to point to + * 'updateblkno'. */ -static void -entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) +static GinPlaceToPageRC +entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage) { + GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; OffsetNumber placed; int cnt = 0; /* these must be static so they can be returned to caller */ static XLogRecData rdata[3]; - static ginxlogInsert data; + static ginxlogInsertEntry data; + + /* quick exit if it doesn't fit */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertPayload, updateblkno, + prdata, newlpage, newrpage); + return SPLIT; + } + + START_CRIT_SECTION(); *prdata = rdata; - data.updateBlkno = entryPreparePage(btree, page, off); + entryPreparePage(btree, page, off, insertData, updateblkno); - placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false); + placed = PageAddItem(page, + (Item) insertData->entry, + IndexTupleSize(insertData->entry), + off, false, false); if (placed != off) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); - data.node = btree->index->rd_node; - data.blkno = BufferGetBlockNumber(buf); + data.isDelete = insertData->isDelete; data.offset = off; - data.nitem = 1; - data.isDelete = btree->isDelete; - data.isData = false; - data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE; - - /* - * Prevent full page write if child's split occurs. That is needed to - * remove incomplete splits while replaying WAL - * - * data.updateBlkno contains new block number (of newly created right - * page) for recently splited page. - */ - if (data.updateBlkno == InvalidBlockNumber) - { - rdata[0].buffer = buf; - rdata[0].buffer_std = TRUE; - rdata[0].data = NULL; - rdata[0].len = 0; - rdata[0].next = &rdata[1]; - cnt++; - } - rdata[cnt].buffer = InvalidBuffer; + rdata[cnt].buffer = buf; + rdata[cnt].buffer_std = true; rdata[cnt].data = (char *) &data; - rdata[cnt].len = sizeof(ginxlogInsert); + rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple); rdata[cnt].next = &rdata[cnt + 1]; cnt++; - rdata[cnt].buffer = InvalidBuffer; - rdata[cnt].data = (char *) btree->entry; - rdata[cnt].len = IndexTupleSize(btree->entry); + rdata[cnt].buffer = buf; + rdata[cnt].buffer_std = true; + rdata[cnt].data = (char *) insertData->entry; + rdata[cnt].len = IndexTupleSize(insertData->entry); rdata[cnt].next = NULL; - btree->entry = NULL; + return INSERTED; } /* * Place tuple and split page, original buffer(lbuf) leaves untouched, - * returns shadow page of lbuf filled new data. + * returns shadow pages filled with new data. * Tuples are distributed between pages by equal size on its, not * an equal number! */ -static Page -entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata) +static void +entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertPayload, + BlockNumber updateblkno, XLogRecData **prdata, + Page *newlpage, Page *newrpage) { + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; OffsetNumber i, maxoff, separator = InvalidOffsetNumber; Size totalsize = 0; + Size tupstoresize; Size lsize = 0, size; char *ptr; - IndexTuple itup, - leftrightmost = NULL; + IndexTuple itup; Page page; - Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); - Page rpage = BufferGetPage(rbuf); + Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); Size pageSize = PageGetPageSize(lpage); /* these must be static so they can be returned to caller */ static XLogRecData rdata[2]; - static ginxlogSplit data; + static ginxlogSplitEntry data; static char tupstore[2 * BLCKSZ]; *prdata = rdata; - data.leftChildBlkno = (GinPageIsLeaf(lpage)) ? - InvalidOffsetNumber : GinGetDownlink(btree->entry); - data.updateBlkno = entryPreparePage(btree, lpage, off); + entryPreparePage(btree, lpage, off, insertData, updateblkno); + /* + * First, append all the existing tuples and the new tuple we're inserting + * one after another in a temporary workspace. + */ maxoff = PageGetMaxOffsetNumber(lpage); ptr = tupstore; - for (i = FirstOffsetNumber; i <= maxoff; i++) { if (i == off) { - size = MAXALIGN(IndexTupleSize(btree->entry)); - memcpy(ptr, btree->entry, size); + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); ptr += size; totalsize += size + sizeof(ItemIdData); } @@ -594,12 +632,17 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR if (off == maxoff + 1) { - size = MAXALIGN(IndexTupleSize(btree->entry)); - memcpy(ptr, btree->entry, size); + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); ptr += size; totalsize += size + sizeof(ItemIdData); } + tupstoresize = ptr - tupstore; + /* + * Initialize the left and right pages, and copy all the tuples back to + * them. + */ GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); @@ -620,7 +663,6 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR } else { - leftrightmost = itup; lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } @@ -630,48 +672,41 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR ptr += MAXALIGN(IndexTupleSize(itup)); } - btree->entry = GinFormInteriorTuple(leftrightmost, lpage, - BufferGetBlockNumber(lbuf)); - - btree->rightblkno = BufferGetBlockNumber(rbuf); - - data.node = btree->index->rd_node; - data.rootBlkno = InvalidBlockNumber; - data.lblkno = BufferGetBlockNumber(lbuf); - data.rblkno = BufferGetBlockNumber(rbuf); data.separator = separator; data.nitem = maxoff; - data.isData = FALSE; - data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE; - data.isRootSplit = FALSE; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplit); + rdata[0].len = sizeof(ginxlogSplitEntry); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; rdata[1].data = tupstore; - rdata[1].len = MAXALIGN(totalsize); + rdata[1].len = tupstoresize; rdata[1].next = NULL; - return lpage; + *newlpage = lpage; + *newrpage = rpage; } /* - * return newly allocated rightmost tuple + * Construct insertion payload for inserting the downlink for given buffer. */ -IndexTuple -ginPageGetLinkItup(Buffer buf) +static void * +entryPrepareDownlink(GinBtree btree, Buffer lbuf) { - IndexTuple itup, - nitup; - Page page = BufferGetPage(buf); + GinBtreeEntryInsertData *insertData; + Page lpage = BufferGetPage(lbuf); + BlockNumber lblkno = BufferGetBlockNumber(lbuf); + IndexTuple itup; - itup = getRightMostTuple(page); - nitup = GinFormInteriorTuple(itup, page, BufferGetBlockNumber(buf)); + itup = getRightMostTuple(lpage); - return nitup; + insertData = palloc(sizeof(GinBtreeEntryInsertData)); + insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno); + insertData->isDelete = false; + + return insertData; } /* @@ -679,20 +714,19 @@ ginPageGetLinkItup(Buffer buf) * Also called from ginxlog, should not use btree */ void -ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) +ginEntryFillRoot(GinBtree btree, Page root, + BlockNumber lblkno, Page lpage, + BlockNumber rblkno, Page rpage) { - Page page; IndexTuple itup; - page = BufferGetPage(root); - - itup = ginPageGetLinkItup(lbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); pfree(itup); - itup = ginPageGetLinkItup(rbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); pfree(itup); } @@ -711,25 +745,23 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, memset(btree, 0, sizeof(GinBtreeData)); btree->index = ginstate->index; + btree->rootBlkno = GIN_ROOT_BLKNO; btree->ginstate = ginstate; btree->findChildPage = entryLocateEntry; + btree->getLeftMostChild = entryGetLeftMostPage; btree->isMoveRight = entryIsMoveRight; btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; - btree->getLeftMostPage = entryGetLeftMostPage; - btree->isEnoughSpace = entryIsEnoughSpace; btree->placeToPage = entryPlaceToPage; - btree->splitPage = entrySplitPage; btree->fillRoot = ginEntryFillRoot; + btree->prepareDownlink = entryPrepareDownlink; btree->isData = FALSE; - btree->searchMode = FALSE; btree->fullScan = FALSE; btree->isBuild = FALSE; btree->entryAttnum = attnum; btree->entryKey = key; btree->entryCategory = category; - btree->isDelete = FALSE; } diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index b9bfde2ee4..09c3e39bf3 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -7,7 +7,7 @@ * transfer pending entries into the regular index structure. This * wins because bulk insertion is much more efficient than retail. * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -119,15 +119,13 @@ writeListPage(Relation index, Buffer buffer, rdata[0].len = sizeof(ginxlogInsertListPage); rdata[0].next = rdata + 1; - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; + rdata[1].buffer = InvalidBuffer; rdata[1].data = workspace; rdata[1].len = size; rdata[1].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata); PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); } /* get free space before releasing buffer */ @@ -290,7 +288,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) if (metadata->head == InvalidBlockNumber) { /* - * Main list is empty, so just copy sublist into main list + * Main list is empty, so just insert sublist as main list */ START_CRIT_SECTION(); @@ -313,6 +311,14 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); + rdata[0].next = rdata + 1; + + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].next = NULL; + Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); @@ -400,12 +406,10 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata); PageSetLSN(metapage, recptr); - PageSetTLI(metapage, ThisTimeLineID); if (buffer != InvalidBuffer) { PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); } } @@ -436,7 +440,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) * Create temporary index tuples for a single indexable item (one index column * for the heap tuple specified by ht_ctid), and append them to the array * in *collector. They will subsequently be written out using - * ginHeapTupleFastInsert. Note that to guarantee consistent state, all + * ginHeapTupleFastInsert. Note that to guarantee consistent state, all * temp tuples for a given heap tuple must be written in one call to * ginHeapTupleFastInsert. */ @@ -482,7 +486,7 @@ ginHeapTupleFastCollect(GinState *ginstate, IndexTuple itup; itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], - NULL, 0, true); + NULL, 0, 0, true); itup->t_tid = *ht_ctid; collector->tuples[collector->ntuples++] = itup; collector->sumsize += IndexTupleSize(itup); @@ -586,13 +590,11 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata); PageSetLSN(metapage, recptr); - PageSetTLI(metapage, ThisTimeLineID); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); } } @@ -705,7 +707,7 @@ processPendingPage(BuildAccumulator *accum, KeyArray *ka, * * This can be called concurrently by multiple backends, so it must cope. * On first glance it looks completely not concurrent-safe and not crash-safe - * either. The reason it's okay is that multiple insertion of the same entry + * either. The reason it's okay is that multiple insertion of the same entry * is detected and treated as a no-op by gininsert.c. If we crash after * posting entries to the main index and before removing them from the * pending list, it's okay because when we redo the posting later on, nothing @@ -759,7 +761,7 @@ ginInsertCleanup(GinState *ginstate, LockBuffer(metabuffer, GIN_UNLOCK); /* - * Initialize. All temporary space will be in opCtx + * Initialize. All temporary space will be in opCtx */ opCtx = AllocSetContextCreate(CurrentMemoryContext, "GIN insert cleanup temporary context", @@ -853,7 +855,7 @@ ginInsertCleanup(GinState *ginstate, /* * While we left the page unlocked, more stuff might have gotten - * added to it. If so, process those entries immediately. There + * added to it. If so, process those entries immediately. There * shouldn't be very many, so we don't worry about the fact that * we're doing this with exclusive lock. Insertion algorithm * guarantees that inserted row(s) will not continue on next page. diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 022bd27b23..144ed504dc 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -4,7 +4,7 @@ * fetch tuples from a GIN scan. * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -20,6 +20,8 @@ #include "utils/datum.h" #include "utils/memutils.h" +/* GUC parameter */ +int GinFuzzySearchLimit = 0; typedef struct pendingPosition { @@ -32,67 +34,6 @@ typedef struct pendingPosition /* - * Convenience function for invoking a key's consistentFn - */ -static bool -callConsistentFn(GinState *ginstate, GinScanKey key) -{ - /* - * If we're dealing with a dummy EVERYTHING key, we don't want to call the - * consistentFn; just claim it matches. - */ - if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING) - { - key->recheckCurItem = false; - return true; - } - - /* - * Initialize recheckCurItem in case the consistentFn doesn't know it - * should set it. The safe assumption in that case is to force recheck. - */ - key->recheckCurItem = true; - - return DatumGetBool(FunctionCall8Coll(&ginstate->consistentFn[key->attnum - 1], - ginstate->supportCollation[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nuserentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(&key->recheckCurItem), - PointerGetDatum(key->queryValues), - PointerGetDatum(key->queryCategories))); -} - -/* - * Tries to refind previously taken ItemPointer on a posting page. - */ -static bool -findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off) -{ - OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; - int res; - - if (GinPageGetOpaque(page)->flags & GIN_DELETED) - /* page was deleted by concurrent vacuum */ - return false; - - /* - * scan page to find equal or first greater value - */ - for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) - { - res = ginCompareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); - - if (res <= 0) - return true; - } - - return false; -} - -/* * Goes to the next page if current offset is outside of bounds */ static bool @@ -105,16 +46,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) /* * We scanned the whole page, so we should take right page */ - stack->blkno = GinPageGetOpaque(page)->rightlink; - if (GinPageRightMost(page)) return false; /* no more pages */ - LockBuffer(stack->buffer, GIN_UNLOCK); - stack->buffer = ReleaseAndReadBuffer(stack->buffer, - btree->index, - stack->blkno); - LockBuffer(stack->buffer, GIN_SHARE); + stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); + stack->blkno = BufferGetBlockNumber(stack->buffer); stack->off = FirstOffsetNumber; } @@ -129,19 +65,17 @@ static void scanPostingTree(Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree) { - GinPostingTreeScan *gdi; + GinBtreeData btree; + GinBtreeStack *stack; Buffer buffer; Page page; - BlockNumber blkno; /* Descend to the leftmost leaf page */ - gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE); - - buffer = ginScanBeginPostingTree(gdi); + stack = ginScanBeginPostingTree(&btree, index, rootPostingTree); + buffer = stack->buffer; IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ - freeGinBtreeStack(gdi->stack); - pfree(gdi); + freeGinBtreeStack(stack); /* * Loop iterates through all leaf pages of posting tree @@ -149,23 +83,17 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, for (;;) { page = BufferGetPage(buffer); - - if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && - GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber) + if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) { - tbm_add_tuples(scanEntry->matchBitmap, - (ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff, false); - scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff; + int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); + + scanEntry->predictNumberResult += n; } if (GinPageRightMost(page)) break; /* no more pages */ - blkno = GinPageGetOpaque(page)->rightlink; - LockBuffer(buffer, GIN_UNLOCK); - buffer = ReleaseAndReadBuffer(buffer, index, blkno); - LockBuffer(buffer, GIN_SHARE); + buffer = ginStepRight(buffer, index, GIN_SHARE); } UnlockReleaseBuffer(buffer); @@ -173,7 +101,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, /* * Collects TIDs into scanEntry->matchBitmap for all heap tuples that - * match the search entry. This supports three different match modes: + * match the search entry. This supports three different match modes: * * 1. Partial-match support: scan from current point until the * comparePartialFn says we're done. @@ -269,7 +197,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, /* * In ALL mode, we are not interested in null items, so we can * stop if we get to a null-item placeholder (which will be the - * last entry for a given attnum). We do want to include NULL_KEY + * last entry for a given attnum). We do want to include NULL_KEY * and EMPTY_ITEM entries, though. */ if (icategory == GIN_CAT_NULL_ITEM) @@ -344,8 +272,11 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, } else { - tbm_add_tuples(scanEntry->matchBitmap, - GinGetPosting(itup), GinGetNPosting(itup), false); + ItemPointer ipd; + int nipd; + + ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); + tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); scanEntry->predictNumberResult += GinGetNPosting(itup); } @@ -354,8 +285,6 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, */ stack->off++; } - - return true; } /* @@ -387,8 +316,7 @@ restartScanEntry: ginPrepareEntryScan(&btreeEntry, entry->attnum, entry->queryKey, entry->queryCategory, ginstate); - btreeEntry.searchMode = TRUE; - stackEntry = ginFindLeafPage(&btreeEntry, NULL); + stackEntry = ginFindLeafPage(&btreeEntry, true); page = BufferGetPage(stackEntry->buffer); needUnlock = TRUE; @@ -438,8 +366,9 @@ restartScanEntry: if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); - GinPostingTreeScan *gdi; + GinBtreeStack *stack; Page page; + ItemPointerData minItem; /* * We should unlock entry page before touching posting tree to @@ -450,9 +379,10 @@ restartScanEntry: */ LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = FALSE; - gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, TRUE); - entry->buffer = ginScanBeginPostingTree(gdi); + stack = ginScanBeginPostingTree(&entry->btree, ginstate->index, + rootPostingTree); + entry->buffer = stack->buffer; /* * We keep buffer pinned because we need to prevent deletion of @@ -462,26 +392,25 @@ restartScanEntry: IncrBufferRefCount(entry->buffer); page = BufferGetPage(entry->buffer); - entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff; /* - * Keep page content in memory to prevent durable page locking + * Load the first page into memory. */ - entry->list = (ItemPointerData *) palloc(BLCKSZ); - entry->nlist = GinPageGetOpaque(page)->maxoff; - memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData)); + ItemPointerSetMin(&minItem); + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem); + + entry->predictNumberResult = stack->predictNumber * entry->nlist; LockBuffer(entry->buffer, GIN_UNLOCK); - freeGinBtreeStack(gdi->stack); - pfree(gdi); + freeGinBtreeStack(stack); entry->isFinished = FALSE; } else if (GinGetNPosting(itup) > 0) { - entry->nlist = GinGetNPosting(itup); - entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist); - memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist); + entry->list = ginReadTuple(ginstate, entry->attnum, itup, + &entry->nlist); + entry->predictNumberResult = entry->nlist; + entry->isFinished = FALSE; } } @@ -491,13 +420,106 @@ restartScanEntry: freeGinBtreeStack(stackEntry); } +/* + * Comparison function for scan entry indexes. Sorts by predictNumberResult, + * least frequent items first. + */ +static int +entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) +{ + const GinScanKey key = (const GinScanKey) arg; + int i1 = *(const int *) a1; + int i2 = *(const int *) a2; + uint32 n1 = key->scanEntry[i1]->predictNumberResult; + uint32 n2 = key->scanEntry[i2]->predictNumberResult; + + if (n1 < n2) + return -1; + else if (n1 == n2) + return 0; + else + return 1; +} + static void -startScanKey(GinState *ginstate, GinScanKey key) +startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) { + MemoryContext oldCtx = CurrentMemoryContext; + int i; + int j; + int *entryIndexes; + ItemPointerSetMin(&key->curItem); key->curItemMatches = false; key->recheckCurItem = false; key->isFinished = false; + + /* + * Divide the entries into two distinct sets: required and additional. + * Additional entries are not enough for a match alone, without any items + * from the required set, but are needed by the consistent function to + * decide if an item matches. When scanning, we can skip over items from + * additional entries that have no corresponding matches in any of the + * required entries. That speeds up queries like "frequent & rare" + * considerably, if the frequent term can be put in the additional set. + * + * There can be many legal ways to divide them entries into these two + * sets. A conservative division is to just put everything in the required + * set, but the more you can put in the additional set, the more you can + * skip during the scan. To maximize skipping, we try to put as many + * frequent items as possible into additional, and less frequent ones into + * required. To do that, sort the entries by frequency + * (predictNumberResult), and put entries into the required set in that + * order, until the consistent function says that none of the remaining + * entries can form a match, without any items from the required set. The + * rest go to the additional set. + */ + if (key->nentries > 1) + { + MemoryContextSwitchTo(so->tempCtx); + + entryIndexes = (int *) palloc(sizeof(int) * key->nentries); + for (i = 0; i < key->nentries; i++) + entryIndexes[i] = i; + qsort_arg(entryIndexes, key->nentries, sizeof(int), + entryIndexByFrequencyCmp, key); + + for (i = 0; i < key->nentries - 1; i++) + { + /* Pass all entries <= i as FALSE, and the rest as MAYBE */ + for (j = 0; j <= i; j++) + key->entryRes[entryIndexes[j]] = GIN_FALSE; + for (j = i + 1; j < key->nentries; j++) + key->entryRes[entryIndexes[j]] = GIN_MAYBE; + + if (key->triConsistentFn(key) == GIN_FALSE) + break; + } + /* i is now the last required entry. */ + + MemoryContextSwitchTo(oldCtx); + + key->nrequired = i + 1; + key->nadditional = key->nentries - key->nrequired; + key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry)); + key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry)); + + j = 0; + for (i = 0; i < key->nrequired; i++) + key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]]; + for (i = 0; i < key->nadditional; i++) + key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]]; + + /* clean up after consistentFn calls (also frees entryIndexes) */ + MemoryContextReset(so->tempCtx); + } + else + { + key->nrequired = 1; + key->nadditional = 0; + key->requiredEntries = palloc(1 * sizeof(GinScanEntry)); + key->requiredEntries[0] = key->scanEntry[0]; + } } static void @@ -532,84 +554,146 @@ startScan(IndexScanDesc scan) } for (i = 0; i < so->nkeys; i++) - startScanKey(ginstate, so->keys + i); + startScanKey(ginstate, so, so->keys + i); } /* - * Gets next ItemPointer from PostingTree. Note, that we copy - * page into GinScanEntry->list array and unlock page, but keep it pinned - * to prevent interference with vacuum + * Load the next batch of item pointers from a posting tree. + * + * Note that we copy the page into GinScanEntry->list array and unlock it, but + * keep it pinned to prevent interference with vacuum. */ static void -entryGetNextItem(GinState *ginstate, GinScanEntry entry) +entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast) { Page page; - BlockNumber blkno; + int i; + bool stepright; + + if (!BufferIsValid(entry->buffer)) + { + entry->isFinished = true; + return; + } + /* + * We have two strategies for finding the correct page: step right from + * the current page, or descend the tree again from the root. If + * advancePast equals the current item, the next matching item should be + * on the next page, so we step right. Otherwise, descend from root. + */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0) + { + stepright = true; + LockBuffer(entry->buffer, GIN_SHARE); + } + else + { + GinBtreeStack *stack; + + ReleaseBuffer(entry->buffer); + + /* + * Set the search key, and find the correct leaf page. + */ + if (ItemPointerIsLossyPage(&advancePast)) + { + ItemPointerSet(&entry->btree.itemptr, + GinItemPointerGetBlockNumber(&advancePast) + 1, + FirstOffsetNumber); + } + else + { + entry->btree.itemptr = advancePast; + entry->btree.itemptr.ip_posid++; + } + entry->btree.fullScan = false; + stack = ginFindLeafPage(&entry->btree, true); + + /* we don't need the stack, just the buffer. */ + entry->buffer = stack->buffer; + IncrBufferRefCount(entry->buffer); + freeGinBtreeStack(stack); + stepright = false; + } + + elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d", + GinItemPointerGetBlockNumber(&advancePast), + GinItemPointerGetOffsetNumber(&advancePast), + !stepright); + + page = BufferGetPage(entry->buffer); for (;;) { - if (entry->offset < entry->nlist) + entry->offset = InvalidOffsetNumber; + if (entry->list) { - entry->curItem = entry->list[entry->offset++]; - return; + pfree(entry->list); + entry->list = NULL; + entry->nlist = 0; } - LockBuffer(entry->buffer, GIN_SHARE); - page = BufferGetPage(entry->buffer); - for (;;) + if (stepright) { /* - * It's needed to go by right link. During that we should refind - * first ItemPointer greater that stored + * We've processed all the entries on this page. If it was the + * last page in the tree, we're done. */ - - blkno = GinPageGetOpaque(page)->rightlink; - - LockBuffer(entry->buffer, GIN_UNLOCK); - if (blkno == InvalidBlockNumber) + if (GinPageRightMost(page)) { - ReleaseBuffer(entry->buffer); - ItemPointerSetInvalid(&entry->curItem); + UnlockReleaseBuffer(entry->buffer); entry->buffer = InvalidBuffer; entry->isFinished = TRUE; return; } - entry->buffer = ReleaseAndReadBuffer(entry->buffer, - ginstate->index, - blkno); - LockBuffer(entry->buffer, GIN_SHARE); + /* + * Step to next page, following the right link. then find the + * first ItemPointer greater than advancePast. + */ + entry->buffer = ginStepRight(entry->buffer, + ginstate->index, + GIN_SHARE); page = BufferGetPage(entry->buffer); + } + stepright = true; - entry->offset = InvalidOffsetNumber; - if (!ItemPointerIsValid(&entry->curItem) || - findItemInPostingPage(page, &entry->curItem, &entry->offset)) - { - /* - * Found position equal to or greater than stored - */ - entry->nlist = GinPageGetOpaque(page)->maxoff; - memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData)); + if (GinPageGetOpaque(page)->flags & GIN_DELETED) + continue; /* page was deleted by concurrent vacuum */ - LockBuffer(entry->buffer, GIN_UNLOCK); + /* + * The first item > advancePast might not be on this page, but + * somewhere to the right, if the page was split, or a non-match from + * another key in the query allowed us to skip some items from this + * entry. Keep following the right-links until we re-find the correct + * page. + */ + if (!GinPageRightMost(page) && + ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0) + { + /* + * the item we're looking is > the right bound of the page, so it + * can't be on this page. + */ + continue; + } - if (!ItemPointerIsValid(&entry->curItem) || - ginCompareItemPointers(&entry->curItem, - entry->list + entry->offset - 1) == 0) - { - /* - * First pages are deleted or empty, or we found exact - * position, so break inner loop and continue outer one. - */ - break; - } + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast); - /* - * Find greater than entry->curItem position, store it. - */ - entry->curItem = entry->list[entry->offset - 1]; + for (i = 0; i < entry->nlist; i++) + { + if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0) + { + entry->offset = i; + if (GinPageRightMost(page)) + { + /* after processing the copied items, we're done. */ + UnlockReleaseBuffer(entry->buffer); + entry->buffer = InvalidBuffer; + } + else + LockBuffer(entry->buffer, GIN_UNLOCK); return; } } @@ -620,10 +704,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* - * Sets entry->curItem to next heap item pointer for one entry of one scan key, - * or sets entry->isFinished to TRUE if there are no more. + * Sets entry->curItem to next heap item pointer > advancePast, for one entry + * of one scan key, or sets entry->isFinished to TRUE if there are no more. * - * Item pointers must be returned in ascending order. + * Item pointers are returned in ascending order. * * Note: this can return a "lossy page" item pointer, indicating that the * entry potentially matches all items on that heap page. However, it is @@ -633,16 +717,33 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) * current implementation this is guaranteed by the behavior of tidbitmaps. */ static void -entryGetItem(GinState *ginstate, GinScanEntry entry) +entryGetItem(GinState *ginstate, GinScanEntry entry, + ItemPointerData advancePast) { Assert(!entry->isFinished); + Assert(!ItemPointerIsValid(&entry->curItem) || + ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); + if (entry->matchBitmap) { + /* A bitmap result */ + BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast); + OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast); + bool gotitem = false; + do { - if (entry->matchResult == NULL || - entry->offset >= entry->matchResult->ntuples) + /* + * If we've exhausted all items on this block, move to next block + * in the bitmap. + */ + while (entry->matchResult == NULL || + (entry->matchResult->ntuples >= 0 && + entry->offset >= entry->matchResult->ntuples) || + entry->matchResult->blockno < advancePastBlk || + (ItemPointerIsLossyPage(&advancePast) && + entry->matchResult->blockno == advancePastBlk)) { entry->matchResult = tbm_iterate(entry->matchIterator); @@ -663,54 +764,103 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) */ entry->offset = 0; } + if (entry->isFinished) + break; + /* + * We're now on the first page after advancePast which has any + * items on it. If it's a lossy result, return that. + */ if (entry->matchResult->ntuples < 0) { - /* - * lossy result, so we need to check the whole page - */ ItemPointerSetLossyPage(&entry->curItem, entry->matchResult->blockno); /* * We might as well fall out of the loop; we could not * estimate number of results on this page to support correct - * reducing of result even if it's enabled + * reducing of result even if it's enabled. */ + gotitem = true; break; } + /* + * Not a lossy page. Skip over any offsets <= advancePast, and + * return that. + */ + if (entry->matchResult->blockno == advancePastBlk) + { + /* + * First, do a quick check against the last offset on the + * page. If that's > advancePast, so are all the other + * offsets. + */ + if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) + { + entry->offset = entry->matchResult->ntuples; + continue; + } + + /* Otherwise scan to find the first item > advancePast */ + while (entry->matchResult->offsets[entry->offset] <= advancePastOff) + entry->offset++; + } + ItemPointerSet(&entry->curItem, entry->matchResult->blockno, entry->matchResult->offsets[entry->offset]); entry->offset++; - } while (entry->reduceResult == TRUE && dropItem(entry)); + gotitem = true; + } while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry))); } else if (!BufferIsValid(entry->buffer)) { - entry->offset++; - if (entry->offset <= entry->nlist) - entry->curItem = entry->list[entry->offset - 1]; - else + /* + * A posting list from an entry tuple, or the last page of a posting + * tree. + */ + do { - ItemPointerSetInvalid(&entry->curItem); - entry->isFinished = TRUE; - } + if (entry->offset >= entry->nlist) + { + ItemPointerSetInvalid(&entry->curItem); + entry->isFinished = TRUE; + break; + } + + entry->curItem = entry->list[entry->offset++]; + } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); + /* XXX: shouldn't we apply the fuzzy search limit here? */ } else { + /* A posting tree */ do { - entryGetNextItem(ginstate, entry); - } while (entry->isFinished == FALSE && - entry->reduceResult == TRUE && - dropItem(entry)); + /* If we've processed the current batch, load more items */ + while (entry->offset >= entry->nlist) + { + entryLoadMoreItems(ginstate, entry, advancePast); + + if (entry->isFinished) + { + ItemPointerSetInvalid(&entry->curItem); + return; + } + } + + entry->curItem = entry->list[entry->offset++]; + + } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 || + (entry->reduceResult == TRUE && dropItem(entry))); } } /* - * Identify the "current" item among the input entry streams for this scan key, - * and test whether it passes the scan key qual condition. + * Identify the "current" item among the input entry streams for this scan key + * that is greater than advancePast, and test whether it passes the scan key + * qual condition. * * The current item is the smallest curItem among the inputs. key->curItem * is set to that value. key->curItemMatches is set to indicate whether that @@ -729,21 +879,30 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) * logic in scanGetItem.) */ static void -keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) +keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, + ItemPointerData advancePast) { ItemPointerData minItem; ItemPointerData curPageLossy; uint32 i; - uint32 lossyEntry; bool haveLossyEntry; GinScanEntry entry; - bool res; + GinTernaryValue res; MemoryContext oldCtx; + bool allFinished; Assert(!key->isFinished); /* - * Find the minimum of the active entry curItems. + * We might have already tested this item; if so, no need to repeat work. + * (Note: the ">" case can happen, if advancePast is exact but we + * previously had to set curItem to a lossy-page pointer.) + */ + if (ginCompareItemPointers(&key->curItem, &advancePast) > 0) + return; + + /* + * Find the minimum item > advancePast among the active entry streams. * * Note: a lossy-page entry is encoded by a ItemPointer with max value for * offset (0xffff), so that it will sort after any exact entries for the @@ -751,16 +910,34 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) * pointers, which is good. */ ItemPointerSetMax(&minItem); - - for (i = 0; i < key->nentries; i++) + allFinished = true; + for (i = 0; i < key->nrequired; i++) { - entry = key->scanEntry[i]; - if (entry->isFinished == FALSE && - ginCompareItemPointers(&entry->curItem, &minItem) < 0) + entry = key->requiredEntries[i]; + + if (entry->isFinished) + continue; + + /* + * Advance this stream if necessary. + * + * In particular, since entry->curItem was initialized with + * ItemPointerSetMin, this ensures we fetch the first item for each + * entry on the first call. + */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast); + if (entry->isFinished) + continue; + } + + allFinished = false; + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) minItem = entry->curItem; } - if (ItemPointerIsMax(&minItem)) + if (allFinished) { /* all entries are finished */ key->isFinished = TRUE; @@ -768,46 +945,96 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) } /* - * We might have already tested this item; if so, no need to repeat work. - * (Note: the ">" case can happen, if minItem is exact but we previously - * had to set curItem to a lossy-page pointer.) + * Ok, we now know that there are no matches < minItem. + * + * If minItem is lossy, it means that there were no exact items on the + * page among requiredEntries, because lossy pointers sort after exact + * items. However, there might be exact items for the same page among + * additionalEntries, so we mustn't advance past them. */ - if (ginCompareItemPointers(&key->curItem, &minItem) >= 0) - return; + if (ItemPointerIsLossyPage(&minItem)) + { + if (GinItemPointerGetBlockNumber(&advancePast) < + GinItemPointerGetBlockNumber(&minItem)) + { + advancePast.ip_blkid = minItem.ip_blkid; + advancePast.ip_posid = 0; + } + } + else + { + Assert(minItem.ip_posid > 0); + advancePast = minItem; + advancePast.ip_posid--; + } /* - * OK, advance key->curItem and perform consistentFn test. + * We might not have loaded all the entry streams for this TID yet. We + * could call the consistent function, passing MAYBE for those entries, to + * see if it can decide if this TID matches based on the information we + * have. But if the consistent-function is expensive, and cannot in fact + * decide with partial information, that could be a big loss. So, load all + * the additional entries, before calling the consistent function. */ - key->curItem = minItem; + for (i = 0; i < key->nadditional; i++) + { + entry = key->additionalEntries[i]; + + if (entry->isFinished) + continue; + + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast); + if (entry->isFinished) + continue; + } + + /* + * Normally, none of the items in additionalEntries can have a curItem + * larger than minItem. But if minItem is a lossy page, then there + * might be exact items on the same page among additionalEntries. + */ + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) + { + Assert(ItemPointerIsLossyPage(&minItem)); + minItem = entry->curItem; + } + } /* + * Ok, we've advanced all the entries up to minItem now. Set key->curItem, + * and perform consistentFn test. + * * Lossy-page entries pose a problem, since we don't know the correct * entryRes state to pass to the consistentFn, and we also don't know what * its combining logic will be (could be AND, OR, or even NOT). If the * logic is OR then the consistentFn might succeed for all items in the * lossy page even when none of the other entries match. * - * If we have a single lossy-page entry then we check to see if the - * consistentFn will succeed with only that entry TRUE. If so, we return - * a lossy-page pointer to indicate that the whole heap page must be + * Our strategy is to call the tri-state consistent function, with the + * lossy-page entries set to MAYBE, and all the other entries FALSE. If it + * returns FALSE, none of the lossy items alone are enough for a match, so + * we don't need to return a lossy-page pointer. Otherwise, return a + * lossy-page pointer to indicate that the whole heap page must be * checked. (On subsequent calls, we'll do nothing until minItem is past * the page altogether, thus ensuring that we never return both regular * and lossy pointers for the same page.) * - * This idea could be generalized to more than one lossy-page entry, but - * ideally lossy-page entries should be infrequent so it would seldom be - * the case that we have more than one at once. So it doesn't seem worth - * the extra complexity to optimize that case. If we do find more than - * one, we just punt and return a lossy-page pointer always. + * An exception is that it doesn't matter what we pass for lossy pointers + * in "hidden" entries, because the consistentFn's result can't depend on + * them. We could pass them as MAYBE as well, but if we're using the + * "shim" implementation of a tri-state consistent function (see + * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass + * them as TRUE. * * Note that only lossy-page entries pointing to the current item's page * should trigger this processing; we might have future lossy pages in the * entry array, but they aren't relevant yet. */ + key->curItem = minItem; ItemPointerSetLossyPage(&curPageLossy, GinItemPointerGetBlockNumber(&key->curItem)); - - lossyEntry = 0; haveLossyEntry = false; for (i = 0; i < key->nentries; i++) { @@ -815,17 +1042,14 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) if (entry->isFinished == FALSE && ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) { - if (haveLossyEntry) - { - /* Multiple lossy entries, punt */ - key->curItem = curPageLossy; - key->curItemMatches = true; - key->recheckCurItem = true; - return; - } - lossyEntry = i; + if (i < key->nuserentries) + key->entryRes[i] = GIN_MAYBE; + else + key->entryRes[i] = GIN_TRUE; haveLossyEntry = true; } + else + key->entryRes[i] = GIN_FALSE; } /* prepare for calling consistentFn in temp context */ @@ -833,11 +1057,10 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) if (haveLossyEntry) { - /* Single lossy-page entry, so see if whole page matches */ - memset(key->entryRes, FALSE, key->nentries); - key->entryRes[lossyEntry] = TRUE; + /* Have lossy-page entries, so see if whole page matches */ + res = key->triConsistentFn(key); - if (callConsistentFn(ginstate, key)) + if (res == GIN_TRUE || res == GIN_MAYBE) { /* Yes, so clean up ... */ MemoryContextSwitchTo(oldCtx); @@ -854,41 +1077,70 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) /* * At this point we know that we don't need to return a lossy whole-page * pointer, but we might have matches for individual exact item pointers, - * possibly in combination with a lossy pointer. Our strategy if there's - * a lossy pointer is to try the consistentFn both ways and return a hit - * if it accepts either one (forcing the hit to be marked lossy so it will - * be rechecked). An exception is that we don't need to try it both ways - * if the lossy pointer is in a "hidden" entry, because the consistentFn's - * result can't depend on that. + * possibly in combination with a lossy pointer. Pass lossy pointers as + * MAYBE to the ternary consistent function, to let it decide if this + * tuple satisfies the overall key, even though we don't know if the lossy + * entries match. * * Prepare entryRes array to be passed to consistentFn. */ for (i = 0; i < key->nentries; i++) { entry = key->scanEntry[i]; - if (entry->isFinished == FALSE && - ginCompareItemPointers(&entry->curItem, &key->curItem) == 0) - key->entryRes[i] = TRUE; + if (entry->isFinished) + key->entryRes[i] = GIN_FALSE; +#if 0 + + /* + * This case can't currently happen, because we loaded all the entries + * for this item earlier. + */ + else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + key->entryRes[i] = GIN_MAYBE; +#endif + else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + key->entryRes[i] = GIN_MAYBE; + else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0) + key->entryRes[i] = GIN_TRUE; else - key->entryRes[i] = FALSE; + key->entryRes[i] = GIN_FALSE; } - if (haveLossyEntry) - key->entryRes[lossyEntry] = TRUE; - res = callConsistentFn(ginstate, key); + res = key->triConsistentFn(key); - if (!res && haveLossyEntry && lossyEntry < key->nuserentries) + switch (res) { - /* try the other way for the lossy item */ - key->entryRes[lossyEntry] = FALSE; + case GIN_TRUE: + key->curItemMatches = true; + /* triConsistentFn set recheckCurItem */ + break; + + case GIN_FALSE: + key->curItemMatches = false; + break; - res = callConsistentFn(ginstate, key); + case GIN_MAYBE: + key->curItemMatches = true; + key->recheckCurItem = true; + break; + + default: + + /* + * the 'default' case shouldn't happen, but if the consistent + * function returns something bogus, this is the safe result + */ + key->curItemMatches = true; + key->recheckCurItem = true; + break; } - key->curItemMatches = res; - /* If we matched a lossy entry, force recheckCurItem = true */ - if (haveLossyEntry) - key->recheckCurItem = true; + /* + * We have a tuple, and we know if it matches or not. If it's a non-match, + * we could continue to find the next matching tuple, but let's break out + * and give scanGetItem a chance to advance the other keys. They might be + * able to skip past to a much higher TID, allowing us to save work. + */ /* clean up after consistentFn calls */ MemoryContextSwitchTo(oldCtx); @@ -905,117 +1157,121 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) * keyGetItem() the combination logic is known only to the consistentFn. */ static bool -scanGetItem(IndexScanDesc scan, ItemPointer advancePast, +scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, ItemPointerData *item, bool *recheck) { GinScanOpaque so = (GinScanOpaque) scan->opaque; - GinState *ginstate = &so->ginstate; - ItemPointerData myAdvancePast = *advancePast; uint32 i; - bool allFinished; bool match; - for (;;) + /*---------- + * Advance the scan keys in lock-step, until we find an item that matches + * all the keys. If any key reports isFinished, meaning its subset of the + * entries is exhausted, we can stop. Otherwise, set *item to the next + * matching item. + * + * This logic works only if a keyGetItem stream can never contain both + * exact and lossy pointers for the same page. Else we could have a + * case like + * + * stream 1 stream 2 + * ... ... + * 42/6 42/7 + * 50/1 42/0xffff + * ... ... + * + * We would conclude that 42/6 is not a match and advance stream 1, + * thus never detecting the match to the lossy pointer in stream 2. + * (keyGetItem has a similar problem versus entryGetItem.) + *---------- + */ + do { - /* - * Advance any entries that are <= myAdvancePast. In particular, - * since entry->curItem was initialized with ItemPointerSetMin, this - * ensures we fetch the first item for each entry on the first call. - */ - allFinished = TRUE; - - for (i = 0; i < so->totalentries; i++) - { - GinScanEntry entry = so->entries[i]; - - while (entry->isFinished == FALSE && - ginCompareItemPointers(&entry->curItem, - &myAdvancePast) <= 0) - entryGetItem(ginstate, entry); - - if (entry->isFinished == FALSE) - allFinished = FALSE; - } - - if (allFinished) - { - /* all entries exhausted, so we're done */ - return false; - } - - /* - * Perform the consistentFn test for each scan key. If any key - * reports isFinished, meaning its subset of the entries is exhausted, - * we can stop. Otherwise, set *item to the minimum of the key - * curItems. - */ - ItemPointerSetMax(item); - - for (i = 0; i < so->nkeys; i++) + ItemPointerSetMin(item); + match = true; + for (i = 0; i < so->nkeys && match; i++) { GinScanKey key = so->keys + i; - keyGetItem(&so->ginstate, so->tempCtx, key); + /* Fetch the next item for this key that is > advancePast. */ + keyGetItem(&so->ginstate, so->tempCtx, key, advancePast); if (key->isFinished) - return false; /* finished one of keys */ - - if (ginCompareItemPointers(&key->curItem, item) < 0) - *item = key->curItem; - } + return false; - Assert(!ItemPointerIsMax(item)); + /* + * If it's not a match, we can immediately conclude that nothing + * <= this item matches, without checking the rest of the keys. + */ + if (!key->curItemMatches) + { + advancePast = key->curItem; + match = false; + break; + } - /*---------- - * Now *item contains first ItemPointer after previous result. - * - * The item is a valid hit only if all the keys succeeded for either - * that exact TID, or a lossy reference to the same page. - * - * This logic works only if a keyGetItem stream can never contain both - * exact and lossy pointers for the same page. Else we could have a - * case like - * - * stream 1 stream 2 - * ... ... - * 42/6 42/7 - * 50/1 42/0xffff - * ... ... - * - * We would conclude that 42/6 is not a match and advance stream 1, - * thus never detecting the match to the lossy pointer in stream 2. - * (keyGetItem has a similar problem versus entryGetItem.) - *---------- - */ - match = true; - for (i = 0; i < so->nkeys; i++) - { - GinScanKey key = so->keys + i; + /* + * It's a match. We can conclude that nothing < matches, so the + * other key streams can skip to this item. + * + * Beware of lossy pointers, though; from a lossy pointer, we can + * only conclude that nothing smaller than this *block* matches. + */ + if (ItemPointerIsLossyPage(&key->curItem)) + { + if (GinItemPointerGetBlockNumber(&advancePast) < + GinItemPointerGetBlockNumber(&key->curItem)) + { + advancePast.ip_blkid = key->curItem.ip_blkid; + advancePast.ip_posid = 0; + } + } + else + { + Assert(key->curItem.ip_posid > 0); + advancePast = key->curItem; + advancePast.ip_posid--; + } - if (key->curItemMatches) + /* + * If this is the first key, remember this location as a potential + * match, and proceed to check the rest of the keys. + * |