Diffstat (limited to 'src')
2257 files changed, 256741 insertions, 103580 deletions
diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000000..a66aac007a --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +/Makefile.global +/Makefile.port +/Makefile.custom diff --git a/src/Makefile b/src/Makefile index b30527e923..65ea50e71d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,7 +4,7 @@ # # Copyright (c) 1994, Regents of the University of California # -# $PostgreSQL: pgsql/src/Makefile,v 1.50 2010/01/20 09:16:23 heikki Exp $ +# src/Makefile # #------------------------------------------------------------------------- @@ -12,22 +12,26 @@ subdir = src top_builddir = .. include Makefile.global +SUBDIRS = \ + port \ + timezone \ + gtm \ + backend \ + backend/utils/mb/conversion_procs \ + backend/snowball \ + include \ + interfaces \ + backend/replication/libpqwalreceiver \ + bin \ + pl \ + makefiles \ + test/regress -all install installdirs uninstall distprep: - $(MAKE) -C port $@ - $(MAKE) -C timezone $@ - # GTM should be built before backend because of dependancy - $(MAKE) -C gtm $@ - $(MAKE) -C interfaces $@ - $(MAKE) -C backend $@ - $(MAKE) -C backend/utils/mb/conversion_procs $@ - $(MAKE) -C backend/snowball $@ - $(MAKE) -C include $@ - $(MAKE) -C backend/replication/libpqwalreceiver $@ - $(MAKE) -C bin $@ - $(MAKE) -C pl $@ - $(MAKE) -C makefiles $@ - $(MAKE) -C test/regress $@ +# There are too many interdependencies between the subdirectories, so +# don't attempt parallel make here. +.NOTPARALLEL: + +$(recurse) install: install-local @@ -47,41 +51,25 @@ uninstall: uninstall-local uninstall-local: rm -f $(addprefix '$(DESTDIR)$(pgxsdir)/$(subdir)'/, Makefile.global Makefile.port Makefile.shlib nls-global.mk) +distprep: + $(MAKE) -C test/isolation $@ + clean: - $(MAKE) -C port $@ - $(MAKE) -C timezone $@ - $(MAKE) -C gtm $@ - $(MAKE) -C backend $@ - $(MAKE) -C backend/snowball $@ - $(MAKE) -C include $@ - $(MAKE) -C interfaces $@ - $(MAKE) -C backend/replication/libpqwalreceiver $@ - $(MAKE) -C bin $@ - $(MAKE) -C pl $@ - $(MAKE) -C makefiles $@ $(MAKE) -C test $@ $(MAKE) -C tutorial NO_PGXS=1 $@ + $(MAKE) -C test/isolation $@ $(MAKE) -C test/thread $@ distclean maintainer-clean: - $(MAKE) -C port $@ - $(MAKE) -C timezone $@ - $(MAKE) -C gtm $@ - $(MAKE) -C backend $@ - $(MAKE) -C backend/snowball $@ - $(MAKE) -C include $@ - $(MAKE) -C interfaces $@ - $(MAKE) -C backend/replication/libpqwalreceiver $@ - $(MAKE) -C bin $@ - $(MAKE) -C pl $@ - $(MAKE) -C makefiles $@ $(MAKE) -C test $@ $(MAKE) -C tutorial NO_PGXS=1 $@ + $(MAKE) -C test/isolation $@ $(MAKE) -C test/thread $@ rm -f Makefile.port Makefile.global coverage: $(MAKE) -C timezone $@ + $(MAKE) -C gtm $@ $(MAKE) -C backend $@ $(MAKE) -C backend/utils/mb/conversion_procs $@ $(MAKE) -C backend/snowball $@ diff --git a/src/Makefile.global.in b/src/Makefile.global.in index a82a301ee4..712ac58696 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -1,5 +1,5 @@ # -*-makefile-*- -# $PostgreSQL: pgsql/src/Makefile.global.in,v 1.264 2010/07/06 22:03:05 tgl Exp $ +# src/Makefile.global.in #------------------------------------------------------------------------------ # All PostgreSQL makefiles include this file and use the variables it sets, @@ -18,7 +18,9 @@ # # Meta configuration -.PHONY: all install install-strip installdirs uninstall clean distclean maintainer-clean distprep check installcheck maintainer-check coverage html man installcheck-parallel world install-world installcheck-world +standard_targets = all install installdirs uninstall distprep clean distclean maintainer-clean 
coverage check installcheck maintainer-check + +.PHONY: $(standard_targets) install-strip html man installcheck-parallel # make `all' the default target all: @@ -156,6 +158,7 @@ with_python = @with_python@ with_tcl = @with_tcl@ with_openssl = @with_openssl@ with_ossp_uuid = @with_ossp_uuid@ +with_selinux = @with_selinux@ with_libxml = @with_libxml@ with_libxslt = @with_libxslt@ with_system_tzdata = @with_system_tzdata@ @@ -258,7 +261,7 @@ RANLIB = @RANLIB@ WINDRES = @WINDRES@ X = @EXEEXT@ -# Perl +# Perl ifneq (@PERL@,) # quoted to protect pathname with spaces @@ -284,9 +287,6 @@ XGETTEXT = @XGETTEXT@ GZIP = gzip BZIP2 = bzip2 -PL_TESTDB = pl_regression -CONTRIB_TESTDB = contrib_regression - # Installation. INSTALL = $(SHELL) $(top_srcdir)/config/install-sh -c @@ -363,6 +363,9 @@ ELF_SYSTEM= @ELF_SYS@ # Backend stack size limit has to be hard-wired on Windows (it's in bytes) WIN32_STACK_RLIMIT=4194304 +# Set if we have a working win32 crashdump header +have_win32_dbghelp = @have_win32_dbghelp@ + # Pull in platform-specific magic include $(top_builddir)/src/Makefile.port @@ -391,7 +394,7 @@ endif # This macro is for use by libraries linking to libpq. (Because libpgport # isn't created with the same link flags as libpq, it can't be used.) libpq = -L$(libpq_builddir) -lpq - + # If doing static linking, shared library dependency info isn't available, # so add in the libraries that libpq depends on. ifeq ($(enable_shared), no) @@ -400,9 +403,9 @@ libpq += $(filter -lintl -lssl -lcrypto -lkrb5 -lcrypt, $(LIBS)) \ endif # This macro is for use by client executables (not libraries) that use libpq. -# We force clients to pull symbols from the non-shared library libpgport -# rather than pulling some libpgport symbols from libpq just because -# libpq uses those functions too. This makes applications less +# We force clients to pull symbols from the non-shared library libpgport +# rather than pulling some libpgport symbols from libpq just because +# libpq uses those functions too. This makes applications less # dependent on changes in libpq's usage of pgport. To do this we link to # pgport before libpq. This does cause duplicate -lpgport's to appear # on client link lines. @@ -424,6 +427,25 @@ submake-libpgport: ########################################################################## # +# Testing support + +PL_TESTDB = pl_regression +CONTRIB_TESTDB = contrib_regression + +ifdef NO_LOCALE +NOLOCALE += --no-locale +endif + +pg_regress_locale_flags = $(if $(ENCODING),--encoding=$(ENCODING)) $(NOLOCALE) + +pg_regress_check = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --temp-install=./tmp_check --top-builddir=$(top_builddir) $(pg_regress_locale_flags) +pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --psqldir=$(PSQLDIR) $(pg_regress_locale_flags) + +pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/ + + +########################################################################## +# # Customization # # This includes your local customizations if Makefile.custom exists @@ -517,7 +539,7 @@ $(top_builddir)/src/include/pg_config.h: $(top_builddir)/src/include/stamp-h $(top_builddir)/src/include/stamp-h: $(top_srcdir)/src/include/pg_config.h.in $(top_builddir)/config.status cd $(top_builddir) && ./config.status src/include/pg_config.h -# Also remake ecpg_config.h from ecpg_config.h.in if the latter changed, same +# Also remake ecpg_config.h from ecpg_config.h.in if the latter changed, same # logic as above. 
$(top_builddir)/src/interfaces/ecpg/include/ecpg_config.h: $(top_builddir)/src/interfaces/ecpg/include/stamp-h @@ -542,6 +564,50 @@ install-strip: ########################################################################## # +# Recursive make support +# ---------------------- +# Instead of recursing through subdirectories with a for loop or +# repeated $(MAKE) -C whatever calls, this is a little smarter: it +# allows parallel make across directories and lets make -k and -q work +# correctly. + +# We need the $(eval) function and order-only prerequisites, which are +# available in GNU make 3.80. That also happens to be the version +# where the .VARIABLES variable was introduced, so this is a simple check. +ifndef .VARIABLES +$(error GNU make 3.80 or newer is required. You are using version $(MAKE_VERSION)) +endif + +# This function is only for internal use below. It should be called +# using $(eval). It will set up a target so that it recurses into +# a given subdirectory. Note that to avoid a nasty bug in make 3.80, +# this function has to avoid using any complicated constructs (like +# multiple targets on a line) and also not contain any lines that expand +# to more than about 200 bytes. This is why we make it apply to just one +# subdirectory at a time, rather than to a list of subdirectories. +# $1: target name, e.g., all +# $2: subdir name +# $3: target to run in subdir, usually same as $1 +define _create_recursive_target +.PHONY: $(1)-$(2)-recurse +$(1): $(1)-$(2)-recurse +$(1)-$(2)-recurse: + $$(MAKE) -C $(2) $(3) +endef +# Note that the use of $$ on the last line above is important; we want +# $(MAKE) to be evaluated when the rule is run, not when the $(eval) is run +# to create the rule. This is necessary to get make -q working. + +# Call this function in a makefile that needs to recurse into subdirectories. +# In the normal case all arguments can be defaulted. +# $1: targets to make recursive (defaults to list of standard targets) +# $2: list of subdirs (defaults to SUBDIRS variable) +# $3: target to run in subdir (defaults to current element of $1) +recurse = $(foreach target,$(if $1,$1,$(standard_targets)),$(foreach subdir,$(if $2,$2,$(SUBDIRS)),$(eval $(call _create_recursive_target,$(target),$(subdir),$(if $3,$3,$(target)))))) + + +########################################################################## +# # Automatic dependency generation # ------------------------------- # When we configure with --enable-depend then we override the default @@ -582,6 +648,17 @@ clean distclean maintainer-clean: clean-deps clean-deps: @rm -rf $(DEPDIR) +# When in automatic dependency mode, never delete any intermediate +# files automatically. Otherwise, the following could happen: When +# starting from a clean source tree, the first build would delete the +# intermediate file, but also create the dependency file, which +# mentions the intermediate file, thus making it non-intermediate. +# The second build will then need to rebuild the now non-intermediate +# missing file. So the second build will do work even though nothing +# had changed. One place where this happens is the .c -> .o -> .so +# chain for some contrib modules. +.SECONDARY: + endif # autodepend @@ -631,7 +708,6 @@ lcov.info: $(gcda_files) $(GCOV) -b -f -p -o . 
$(GCOVFLAGS) $*.c >$*.c.gcov.out coverage: $(gcda_files:.gcda=.c.gcov) lcov.info - $(if $(SUBDIRS),for dir in $(SUBDIRS); do $(MAKE) -C $$dir coverage || exit; done) .PHONY: coverage-html coverage-html: coverage diff --git a/src/Makefile.shlib b/src/Makefile.shlib index f3dc2409b9..a5cf6c6c16 100644 --- a/src/Makefile.shlib +++ b/src/Makefile.shlib @@ -6,7 +6,7 @@ # Copyright (c) 1998, Regents of the University of California # # IDENTIFICATION -# $PostgreSQL: pgsql/src/Makefile.shlib,v 1.125 2010/07/06 03:55:33 tgl Exp $ +# src/Makefile.shlib # #------------------------------------------------------------------------- @@ -22,6 +22,7 @@ # OBJS List of object files to include in library # SHLIB_LINK If shared library relies on other libraries, # additional stuff to put in its link command +# SHLIB_PREREQS Order-only prerequisites for library build target # SHLIB_EXPORTS (optional) Name of file containing list of symbols to # export, in the format "function_name number" # @@ -270,7 +271,7 @@ endif ifeq ($(PORTNAME), sunos4) LINK.shared = $(LD) -assert pure-text -Bdynamic endif - + ifeq ($(PORTNAME), osf) LINK.shared = $(LD) -shared -expect_unresolved '*' endif @@ -340,7 +341,7 @@ all-static-lib: $(stlib) all-shared-lib: $(shlib) ifndef haslibarule -$(stlib): $(OBJS) +$(stlib): $(OBJS) | $(SHLIB_PREREQS) $(LINK.static) $@ $^ $(RANLIB) $@ endif #haslibarule @@ -351,7 +352,7 @@ ifeq (,$(filter cygwin win32,$(PORTNAME))) ifneq ($(PORTNAME), aix) # Normal case -$(shlib): $(OBJS) +$(shlib): $(OBJS) | $(SHLIB_PREREQS) $(LINK.shared) -o $@ $(OBJS) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK) ifdef shlib_major # If we're using major and minor versions, then make a symlink to major-version-only. @@ -382,7 +383,7 @@ endif else # PORTNAME == aix # AIX case -$(shlib) $(stlib): $(OBJS) +$(shlib) $(stlib): $(OBJS) | $(SHLIB_PREREQS) $(LINK.static) $(stlib) $^ $(RANLIB) $(stlib) $(MKLDEXPORT) $(stlib) >$(exports_file) @@ -408,10 +409,10 @@ else DLL_DEFFILE = lib$(NAME)dll.def endif -$(shlib): $(OBJS) $(DLL_DEFFILE) +$(shlib): $(OBJS) $(DLL_DEFFILE) | $(SHLIB_PREREQS) $(DLLWRAP) -o $@ --dllname $(shlib) $(DLLWRAP_FLAGS) --def $(DLL_DEFFILE) $(OBJS) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK) -$(stlib): $(shlib) $(DLL_DEFFILE) +$(stlib): $(shlib) $(DLL_DEFFILE) | $(SHLIB_PREREQS) $(DLLTOOL) --dllname $(shlib) $(DLLTOOL_LIBFLAGS) --def $(DLL_DEFFILE) --output-lib $@ endif # PORTNAME == cygwin || PORTNAME == win32 diff --git a/src/backend/.gitignore b/src/backend/.gitignore new file mode 100644 index 0000000000..4e04ff0575 --- /dev/null +++ b/src/backend/.gitignore @@ -0,0 +1,2 @@ +/postgres +/postgres.def diff --git a/src/backend/Makefile b/src/backend/Makefile index a707a94056..905ba06414 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -2,10 +2,10 @@ # # Makefile for the postgres backend # -# Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group +# Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California # -# $PostgreSQL: pgsql/src/backend/Makefile,v 1.145 2010/07/05 18:54:37 tgl Exp $ +# src/backend/Makefile # #------------------------------------------------------------------------- @@ -128,15 +128,20 @@ endif endif # aix # Update the commonly used headers before building the subdirectories -$(SUBDIRS:%=%-recursive): $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/probes.h 
+$(SUBDIRS:%=%-recursive): $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/probes.h # run this unconditionally to avoid needing to know its dependencies here: submake-schemapg: $(MAKE) -C catalog schemapg.h -.PHONY: submake-schemapg +# src/port needs a convenient way to force errcodes.h to get built +submake-errcodes: $(top_builddir)/src/include/utils/errcodes.h -catalog/schemapg.h: submake-schemapg +.PHONY: submake-schemapg submake-errcodes + +catalog/schemapg.h: | submake-schemapg + +$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport # The postgres.o target is needed by the rule in Makefile.global that @@ -155,6 +160,9 @@ parser/gram.h: parser/gram.y utils/fmgroids.h: utils/Gen_fmgrtab.pl catalog/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h $(MAKE) -C utils fmgroids.h +utils/errcodes.h: utils/generate-errcodes.pl utils/errcodes.txt + $(MAKE) -C utils errcodes.h + utils/probes.h: utils/probes.d $(MAKE) -C utils probes.h @@ -179,6 +187,10 @@ $(top_builddir)/src/include/catalog/schemapg.h: catalog/schemapg.h cd $(dir $@) && rm -f $(notdir $@) && \ $(LN_S) "$$prereqdir/$(notdir $<)" . +$(top_builddir)/src/include/utils/errcodes.h: utils/errcodes.h + cd $(dir $@) && rm -f $(notdir $@) && \ + $(LN_S) ../../../$(subdir)/utils/errcodes.h . + $(top_builddir)/src/include/utils/fmgroids.h: utils/fmgroids.h prereqdir=`cd $(dir $<) >/dev/null && pwd` && \ cd $(dir $@) && rm -f $(notdir $@) && \ @@ -195,11 +207,13 @@ utils/probes.o: utils/probes.d $(SUBDIROBJS) ########################################################################## +# Be sure that these files get removed by the maintainer-clean target distprep: $(MAKE) -C parser gram.c gram.h scan.c $(MAKE) -C bootstrap bootparse.c bootscanner.c $(MAKE) -C catalog schemapg.h postgres.bki postgres.description postgres.shdescription - $(MAKE) -C utils fmgrtab.c fmgroids.h + $(MAKE) -C replication repl_gram.c repl_scanner.c + $(MAKE) -C utils fmgrtab.c fmgroids.h errcodes.h $(MAKE) -C utils/misc guc-file.c @@ -295,20 +309,25 @@ ifeq ($(PORTNAME), win32) endif distclean: clean - rm -f port/tas.s port/dynloader.c port/pg_sema.c port/pg_shmem.c + rm -f port/tas.s port/dynloader.c port/pg_sema.c port/pg_shmem.c \ + port/pg_latch.c maintainer-clean: distclean rm -f bootstrap/bootparse.c \ bootstrap/bootscanner.c \ parser/gram.c \ - parser/scan.c \ parser/gram.h \ + parser/scan.c \ catalog/schemapg.h \ catalog/postgres.bki \ catalog/postgres.description \ catalog/postgres.shdescription \ + replication/repl_gram.c \ + replication/repl_gram.h \ + replication/repl_scanner.c \ utils/fmgroids.h \ utils/fmgrtab.c \ + utils/errcodes.h \ utils/misc/guc-file.c @@ -316,7 +335,7 @@ maintainer-clean: distclean # # Support for code development. # -# Use target "quick" to build "postgres" when you know all the subsystems +# Use target "quick" to build "postgres" when you know all the subsystems # are up to date. It saves the time of doing all the submakes. 
.PHONY: quick quick: $(OBJS) diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index cfbb5a5d35..a4c4ca7da9 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -1,7 +1,7 @@ # # Makefile for the access methods module # -# $PostgreSQL: pgsql/src/backend/access/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $ +# src/backend/access/Makefile # subdir = src/backend/access diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index 011c60fce5..1fa6de0823 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -4,7 +4,7 @@ # Makefile for access/common # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/common/Makefile,v 1.26 2009/08/06 20:44:31 tgl Exp $ +# src/backend/access/common/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index d34f00261a..cb1f8f89fd 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -45,12 +45,12 @@ * and we'd like to still refer to them via C struct offsets. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/heaptuple.c,v 1.130 2010/01/10 04:26:36 rhaas Exp $ + * src/backend/access/common/heaptuple.c * *------------------------------------------------------------------------- */ @@ -353,7 +353,7 @@ nocachegetattr(HeapTuple tuple, * * check to see if any preceding bits are null... */ - int byte = attnum >> 3; + int byte = attnum >> 3; int finalbit = attnum & 0x07; /* check for nulls "before" final bit of last byte */ diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index 3ce377b854..85c43199aa 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -4,12 +4,12 @@ * This file contains index tuple accessor and mutator routines, * as well as various tuple utilities. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/indextuple.c,v 1.91 2010/01/10 04:26:36 rhaas Exp $ + * src/backend/access/common/indextuple.c * *------------------------------------------------------------------------- */ @@ -237,7 +237,7 @@ nocache_index_getattr(IndexTuple tup, * Now check to see if any preceding bits are null... */ { - int byte = attnum >> 3; + int byte = attnum >> 3; int finalbit = attnum & 0x07; /* check for nulls "before" final bit of last byte */ diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 8ff4f6aad5..8b5f4a862f 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -5,11 +5,11 @@ * clients and standalone backends are supported here). 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/printtup.c,v 1.106 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/common/printtup.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index cd4f59005a..465742556f 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -3,12 +3,12 @@ * reloptions.c * Core support for relation options (pg_class.reloptions) * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/reloptions.c,v 1.35 2010/06/07 02:59:02 itagaki Exp $ + * src/backend/access/common/reloptions.c * *------------------------------------------------------------------------- */ @@ -782,6 +782,9 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions) case RELKIND_INDEX: options = index_reloptions(amoptions, datum, false); break; + case RELKIND_FOREIGN_TABLE: + options = NULL; + break; default: Assert(false); /* can't get here */ options = NULL; /* keep compiler quiet */ @@ -1174,7 +1177,7 @@ heap_reloptions(char relkind, Datum reloptions, bool validate) case RELKIND_RELATION: return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP); default: - /* sequences, composite types and views are not supported */ + /* other relkinds are not supported */ return NULL; } } @@ -1203,7 +1206,7 @@ index_reloptions(RegProcedure amoptions, Datum reloptions, bool validate) /* Can't use OidFunctionCallN because we might get a NULL result */ fmgr_info(amoptions, &flinfo); - InitFunctionCallInfoData(fcinfo, &flinfo, 2, NULL, NULL); + InitFunctionCallInfoData(fcinfo, &flinfo, 2, InvalidOid, NULL, NULL); fcinfo.arg[0] = reloptions; fcinfo.arg[1] = BoolGetDatum(validate); diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c index 23d6b88d41..c879b8aaa4 100644 --- a/src/backend/access/common/scankey.c +++ b/src/backend/access/common/scankey.c @@ -3,18 +3,19 @@ * scankey.c * scan key support code * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/scankey.c,v 1.34 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/common/scankey.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/skey.h" +#include "catalog/pg_collation.h" /* @@ -33,6 +34,7 @@ ScanKeyEntryInitialize(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, + Oid collation, RegProcedure procedure, Datum argument) { @@ -40,9 +42,12 @@ ScanKeyEntryInitialize(ScanKey entry, entry->sk_attno = attributeNumber; entry->sk_strategy = strategy; entry->sk_subtype = subtype; + entry->sk_collation = collation; entry->sk_argument = argument; if (RegProcedureIsValid(procedure)) + { fmgr_info(procedure, &entry->sk_func); + } else { 
Assert(flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -53,12 +58,16 @@ ScanKeyEntryInitialize(ScanKey entry, /* * ScanKeyInit * Shorthand version of ScanKeyEntryInitialize: flags and subtype - * are assumed to be zero (the usual value). + * are assumed to be zero (the usual value), and collation is defaulted. * * This is the recommended version for hardwired lookups in system catalogs. * It cannot handle NULL arguments, unary operators, or nondefault operators, * but we need none of those features for most hardwired lookups. * + * We set collation to DEFAULT_COLLATION_OID always. This is appropriate + * for textual columns in system catalogs, and it will be ignored for + * non-textual columns, so it's not worth trying to be more finicky. + * * Note: CurrentMemoryContext at call should be as long-lived as the ScanKey * itself, because that's what will be used for any subsidiary info attached * to the ScanKey's FmgrInfo record. @@ -74,6 +83,7 @@ ScanKeyInit(ScanKey entry, entry->sk_attno = attributeNumber; entry->sk_strategy = strategy; entry->sk_subtype = InvalidOid; + entry->sk_collation = DEFAULT_COLLATION_OID; entry->sk_argument = argument; fmgr_info(procedure, &entry->sk_func); } @@ -93,6 +103,7 @@ ScanKeyEntryInitializeWithInfo(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, + Oid collation, FmgrInfo *finfo, Datum argument) { @@ -100,6 +111,7 @@ ScanKeyEntryInitializeWithInfo(ScanKey entry, entry->sk_attno = attributeNumber; entry->sk_strategy = strategy; entry->sk_subtype = subtype; + entry->sk_collation = collation; entry->sk_argument = argument; fmgr_info_copy(&entry->sk_func, finfo, CurrentMemoryContext); } diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index fec3b3ef58..34e5f11440 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -9,12 +9,12 @@ * executor's "junkfilter" routines, but these functions work on bare * HeapTuples rather than TupleTableSlots. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/tupconvert.c,v 1.4 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/common/tupconvert.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index 2125fdfb0b..16979c4ea7 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -3,12 +3,12 @@ * tupdesc.c * POSTGRES tuple descriptor support code * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/tupdesc.c,v 1.133 2010/02/14 18:42:12 rhaas Exp $ + * src/backend/access/common/tupdesc.c * * NOTES * some of the executor utility code such as "ExecTypeFromTL" should be @@ -360,6 +360,8 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2) return false; if (attr1->attinhcount != attr2->attinhcount) return false; + if (attr1->attcollation != attr2->attcollation) + return false; /* attacl and attoptions are not even present... 
*/ } @@ -427,6 +429,10 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2) * TupleDescInitEntry * This function initializes a single attribute structure in * a previously allocated tuple descriptor. + * + * Note that attcollation is set to the default for the specified datatype. + * If a nondefault collation is needed, insert it afterwards using + * TupleDescInitEntryCollation. */ void TupleDescInitEntry(TupleDesc desc, @@ -488,10 +494,32 @@ TupleDescInitEntry(TupleDesc desc, att->attbyval = typeForm->typbyval; att->attalign = typeForm->typalign; att->attstorage = typeForm->typstorage; + att->attcollation = typeForm->typcollation; ReleaseSysCache(tuple); } +/* + * TupleDescInitEntryCollation + * + * Assign a nondefault collation to a previously initialized tuple descriptor + * entry. + */ +void +TupleDescInitEntryCollation(TupleDesc desc, + AttrNumber attributeNumber, + Oid collationid) +{ + /* + * sanity checks + */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + AssertArg(attributeNumber <= desc->natts); + + desc->attrs[attributeNumber - 1]->attcollation = collationid; +} + /* * BuildDescForRelation @@ -513,6 +541,7 @@ BuildDescForRelation(List *schema) char *attname; Oid atttypid; int32 atttypmod; + Oid attcollation; int attdim; /* @@ -536,7 +565,8 @@ BuildDescForRelation(List *schema) attnum++; attname = entry->colname; - atttypid = typenameTypeId(NULL, entry->typeName, &atttypmod); + typenameTypeIdAndMod(NULL, entry->typeName, &atttypid, &atttypmod); + attcollation = GetColumnDefCollation(NULL, entry, atttypid); attdim = list_length(entry->typeName->arrayBounds); if (entry->typeName->setof) @@ -549,6 +579,7 @@ BuildDescForRelation(List *schema) atttypid, atttypmod, attdim); /* Override TupleDescInitEntry's settings as requested */ + TupleDescInitEntryCollation(desc, attnum, attcollation); if (entry->storage) desc->attrs[attnum - 1]->attstorage = entry->storage; @@ -582,24 +613,28 @@ BuildDescForRelation(List *schema) * BuildDescFromLists * * Build a TupleDesc given lists of column names (as String nodes), - * column type OIDs, and column typmods. No constraints are generated. + * column type OIDs, typmods, and collation OIDs. + * + * No constraints are generated. * * This is essentially a cut-down version of BuildDescForRelation for use * with functions returning RECORD. 
*/ TupleDesc -BuildDescFromLists(List *names, List *types, List *typmods) +BuildDescFromLists(List *names, List *types, List *typmods, List *collations) { int natts; AttrNumber attnum; ListCell *l1; ListCell *l2; ListCell *l3; + ListCell *l4; TupleDesc desc; natts = list_length(names); Assert(natts == list_length(types)); Assert(natts == list_length(typmods)); + Assert(natts == list_length(collations)); /* * allocate a new tuple descriptor @@ -610,20 +645,25 @@ BuildDescFromLists(List *names, List *types, List *typmods) l2 = list_head(types); l3 = list_head(typmods); + l4 = list_head(collations); foreach(l1, names) { char *attname = strVal(lfirst(l1)); Oid atttypid; int32 atttypmod; + Oid attcollation; atttypid = lfirst_oid(l2); l2 = lnext(l2); atttypmod = lfirst_int(l3); l3 = lnext(l3); + attcollation = lfirst_oid(l4); + l4 = lnext(l4); attnum++; TupleDescInitEntry(desc, attnum, attname, atttypid, atttypmod, 0); + TupleDescInitEntryCollation(desc, attnum, attcollation); } return desc; diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile index 23b75fc1d8..889dde6a27 100644 --- a/src/backend/access/gin/Makefile +++ b/src/backend/access/gin/Makefile @@ -4,7 +4,7 @@ # Makefile for access/gin # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/gin/Makefile,v 1.4 2009/03/24 20:17:10 tgl Exp $ +# src/backend/access/gin/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index cd406935e0..67159d8529 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/gin/README,v 1.7 2010/02/08 04:33:52 tgl Exp $ +src/backend/access/gin/README Gin for PostgreSQL ================== @@ -9,29 +9,29 @@ Gin stands for Generalized Inverted Index and should be considered as a genie, not a drink. Generalized means that the index does not know which operation it accelerates. -It instead works with custom strategies, defined for specific data types (read -"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin +It instead works with custom strategies, defined for specific data types (read +"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin is similar to GiST and differs from btree indices, which have predefined, comparison-based operations. -An inverted index is an index structure storing a set of (key, posting list) -pairs, where 'posting list' is a set of documents in which the key occurs. -(A text document would usually contain many keys.) The primary goal of +An inverted index is an index structure storing a set of (key, posting list) +pairs, where 'posting list' is a set of heap rows in which the key occurs. +(A text document would usually contain many keys.) The primary goal of Gin indices is support for highly scalable, full-text search in PostgreSQL. -Gin consists of a B-tree index constructed over entries (ET, entries tree), -where each entry is an element of the indexed value (element of array, lexeme -for tsvector) and where each tuple in a leaf page is either a pointer to a -B-tree over item pointers (PT, posting tree), or a list of item pointers -(PL, posting list) if the tuple is small enough. 
+A Gin index consists of a B-tree index constructed over key values, +where each key is an element of some indexed items (element of array, lexeme +for tsvector) and where each tuple in a leaf page contains either a pointer to +a B-tree over item pointers (posting tree), or a simple list of item pointers +(posting list) if the list is small enough. -Note: There is no delete operation for ET. The reason for this is that in -our experience, the set of distinct words in a large corpus changes very -rarely. This greatly simplifies the code and concurrency algorithms. +Note: There is no delete operation in the key (entry) tree. The reason for +this is that in our experience, the set of distinct words in a large corpus +changes very slowly. This greatly simplifies the code and concurrency +algorithms. -Gin comes with built-in support for one-dimensional arrays (eg. integer[], -text[]), but no support for NULL elements. The following operations are -available: +Core PostgreSQL includes built-in Gin support for one-dimensional arrays +(eg. integer[], text[]). The following operations are available: * contains: value_array @> query_array * overlaps: value_array && query_array @@ -59,61 +59,177 @@ Gin Fuzzy Limit There are often situations when a full-text search returns a very large set of results. Since reading tuples from the disk and sorting them could take a -lot of time, this is unacceptable for production. (Note that the search +lot of time, this is unacceptable for production. (Note that the search itself is very fast.) -Such queries usually contain very frequent lexemes, so the results are not -very helpful. To facilitate execution of such queries Gin has a configurable -soft upper limit on the size of the returned set, determined by the -'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no +Such queries usually contain very frequent lexemes, so the results are not +very helpful. To facilitate execution of such queries Gin has a configurable +soft upper limit on the size of the returned set, determined by the +'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no limit). If a non-zero search limit is set, then the returned set is a subset of the whole result set, chosen at random. -"Soft" means that the actual number of returned results could slightly differ -from the specified limit, depending on the query and the quality of the +"Soft" means that the actual number of returned results could differ +from the specified limit, depending on the query and the quality of the system's random number generator. From experience, a value of 'gin_fuzzy_search_limit' in the thousands (eg. 5000-20000) works well. This means that 'gin_fuzzy_search_limit' will -have no effect for queries returning a result set with less tuples than this +have no effect for queries returning a result set with less tuples than this number. -Limitations ------------ - - * No support for multicolumn indices - * Gin doesn't uses scan->kill_prior_tuple & scan->ignore_killed_tuples - * Gin searches entries only by equality matching. This may be improved in - future. - * Gin doesn't support full scans of indices. - * Gin doesn't index NULL values. +Index structure +--------------- -Open Items ----------- +The "items" that a GIN index indexes are composite values that contain +zero or more "keys". For example, an item might be an integer array, and +then the keys would be the individual integer values. The index actually +stores and searches for the key values, not the items per se. 
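[The items-versus-keys distinction just described is the essence of an inverted index. A toy illustration in plain C follows; it is not PostgreSQL code, and every name in it is made up for this sketch. Items are small integer arrays, their elements are the keys, and the index maps each key to the "posting list" of item numbers that contain it.]

    /*
     * Toy inverted index, illustrative only: items are small integer
     * arrays, keys are their elements, and each key maps to a sorted
     * "posting list" of the item numbers in which it occurs.
     */
    #include <stdio.h>

    #define MAXKEY   100            /* keys are ints in [0, MAXKEY) */
    #define MAXITEMS 16

    static int posting[MAXKEY][MAXITEMS];   /* posting list per key */
    static int nposting[MAXKEY];

    static void
    index_item(int itemno, const int *keys, int nkeys)
    {
        int     i;

        /* items arrive in itemno order, so each list stays sorted */
        for (i = 0; i < nkeys; i++)
            posting[keys[i]][nposting[keys[i]]++] = itemno;
    }

    int
    main(void)
    {
        int     item0[] = {10, 20};
        int     item1[] = {20, 30, 40};
        int     i;

        index_item(0, item0, 2);
        index_item(1, item1, 3);

        /* "which items contain key 20?" -> prints item 0 and item 1 */
        for (i = 0; i < nposting[20]; i++)
            printf("item %d\n", posting[20][i]);
        return 0;
    }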
In the +pg_opclass entry for a GIN opclass, the opcintype is the data type of the +items, and the opckeytype is the data type of the keys. GIN is optimized +for cases where items contain many keys and the same key values appear +in many different items. + +A GIN index contains a metapage, a btree of key entries, and possibly +"posting tree" pages, which hold the overflow when a key entry acquires +too many heap tuple pointers to fit in a btree page. Additionally, if the +fast-update feature is enabled, there can be "list pages" holding "pending" +key entries that haven't yet been merged into the main btree. The list +pages have to be scanned linearly when doing a search, so the pending +entries should be merged into the main btree before there get to be too +many of them. The advantage of the pending list is that bulk insertion of +a few thousand entries can be much faster than retail insertion. (The win +comes mainly from not having to do multiple searches/insertions when the +same key appears in multiple new heap tuples.) + +Key entries are nominally of the same IndexEntry format as used in other +index types, but since a leaf key entry typically refers to multiple heap +tuples, there are significant differences. (See GinFormTuple, which works +by building a "normal" index tuple and then modifying it.) The points to +know are: + +* In a single-column index, a key tuple just contains the key datum, but +in a multi-column index, a key tuple contains the pair (column number, +key datum) where the column number is stored as an int2. This is needed +to support different key data types in different columns. This much of +the tuple is built by index_form_tuple according to the usual rules. +The column number (if present) can never be null, but the key datum can +be, in which case a null bitmap is present as usual. (As usual for index +tuples, the size of the null bitmap is fixed at INDEX_MAX_KEYS.) + +* If the key datum is null (ie, IndexTupleHasNulls() is true), then +just after the nominal index data (ie, at offset IndexInfoFindDataOffset +or IndexInfoFindDataOffset + sizeof(int2)) there is a byte indicating +the "category" of the null entry. These are the possible categories: + 1 = ordinary null key value extracted from an indexable item + 2 = placeholder for zero-key indexable item + 3 = placeholder for null indexable item +Placeholder null entries are inserted into the index because otherwise +there would be no index entry at all for an empty or null indexable item, +which would mean that full index scans couldn't be done and various corner +cases would give wrong answers. The different categories of null entries +are treated as distinct keys by the btree, but heap itempointers for the +same category of null entry are merged into one index entry just as happens +with ordinary key entries. + +* In a key entry at the btree leaf level, at the next SHORTALIGN boundary, +there is an array of zero or more ItemPointers, which store the heap tuple +TIDs for which the indexable items contain this key. This is called the +"posting list". The TIDs in a posting list must appear in sorted order. +If the list would be too big for the index tuple to fit on an index page, +the ItemPointers are pushed out to a separate posting page or pages, and +none appear in the key entry itself. The separate pages are called a +"posting tree"; they are organized as a btree of ItemPointer values. 
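[The null-entry categories listed in the README text fit in the one-byte "category" field. A minimal sketch follows: the type name GinNullCategory and the constant GIN_CAT_NORM_KEY appear elsewhere in this patch, but the remaining constant names, the signed one-byte typedef, and the value 0 for ordinary keys are assumptions here; only the values 1..3 come from the list above.]

    typedef signed char GinNullCategory;    /* assumed representation */

    #define GIN_CAT_NORM_KEY    0   /* ordinary non-null key (assumed value) */
    #define GIN_CAT_NULL_KEY    1   /* null key value from an indexable item */
    #define GIN_CAT_EMPTY_ITEM  2   /* placeholder for zero-key indexable item */
    #define GIN_CAT_NULL_ITEM   3   /* placeholder for null indexable item */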
+Note that in either case, the ItemPointers associated with a key can +easily be read out in sorted order; this is relied on by the scan +algorithms. + +* The index tuple header fields of a leaf key entry are abused as follows: + +1) Posting list case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the offset from index + tuple start to the posting list. + Access macros: GinGetPostingOffset(itup) / GinSetPostingOffset(itup,n) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the number of elements + in the posting list (number of heap itempointers). + Access macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) + +* The posting list can be accessed with GinGetPosting(itup) + +2) Posting tree case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number + of the root of the posting tree. + Access macros: GinGetPostingTree(itup) / GinSetPostingTree(itup, blkno) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the magic number + GIN_TREE_POSTING, which distinguishes this from the posting-list case + (it's large enough that that many heap itempointers couldn't possibly + fit on an index page). This value is inserted automatically by the + GinSetPostingTree macro. + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup) / GinSetNullCategory(itup,c) + +* The posting list is not present and must not be accessed. + +Use the macro GinIsPostingTree(itup) to determine which case applies. + +In both cases, itup->t_info & INDEX_SIZE_MASK contains actual total size of +tuple, and the INDEX_VAR_MASK and INDEX_NULL_MASK bits have their normal +meanings as set by index_form_tuple. + +Index tuples in non-leaf levels of the btree contain the optional column +number, key datum, and null category byte as above. They do not contain +a posting list. ItemPointerGetBlockNumber(&itup->t_tid) is the downlink +to the next lower btree level, and ItemPointerGetOffsetNumber(&itup->t_tid) +is InvalidOffsetNumber. Use the access macros GinGetDownlink/GinSetDownlink +to get/set the downlink. + +Index entries that appear in "pending list" pages work a tad differently as +well. The optional column number, key datum, and null category byte are as +for other GIN index entries. However, there is always exactly one heap +itempointer associated with a pending entry, and it is stored in the t_tid +header field just as in non-GIN indexes. There is no posting list. +Furthermore, the code that searches the pending list assumes that all +entries for a given heap tuple appear consecutively in the pending list and +are sorted by the column-number-plus-key-datum. The GIN_LIST_FULLROW page +flag bit tells whether entries for a given heap tuple are spread across +multiple pending-list pages. If GIN_LIST_FULLROW is set, the page contains +all the entries for one or more heap tuples. If GIN_LIST_FULLROW is clear, +the page contains entries for only one heap tuple, *and* they are not all +the entries for that tuple. (Thus, a heap tuple whose entries do not all +fit on one pending-list page must have those pages to itself, even if this +results in wasting much of the space on the preceding page and the last +page for the tuple.) -We appreciate any comments, help and suggestions. +Limitations +----------- - * Teach optimizer/executor that GIN is intrinsically clustered. 
i.e., it - always returns ItemPointer in ascending order. - * Tweak gincostestimate. + * Gin doesn't use scan->kill_prior_tuple & scan->ignore_killed_tuples + * Gin searches entries only by equality matching, or simple range + matching using the "partial match" feature. TODO ---- Nearest future: - * Opclasses for all types (no programming, just many catalog changes). + * Opclasses for more types (no programming, just many catalog changes) Distant future: * Replace B-tree of entries to something like GiST - * Add multicolumn support - * Optimize insert operations (background index insertion) Authors ------- -All work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov +Original work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov (oleg@sai.msu.su). diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c index db5c9e3d19..2de58604ee 100644 --- a/src/backend/access/gin/ginarrayproc.c +++ b/src/backend/access/gin/ginarrayproc.c @@ -4,17 +4,19 @@ * support functions for GIN's indexing of any array * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginarrayproc.c,v 1.17 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/gin/ginarrayproc.c *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/gin.h" +#include "access/skey.h" #include "utils/array.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" @@ -23,34 +25,23 @@ #define GinContainedStrategy 3 #define GinEqualStrategy 4 -#define ARRAYCHECK(x) do { \ - if ( ARR_HASNULL(x) ) \ - ereport(ERROR, \ - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \ - errmsg("array must not contain null values"))); \ -} while(0) - /* - * Function used as extractValue and extractQuery both + * extractValue support function */ Datum ginarrayextract(PG_FUNCTION_ARGS) { - ArrayType *array; - int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - Datum *entries = NULL; + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + bool **nullFlags = (bool **) PG_GETARG_POINTER(2); int16 elmlen; bool elmbyval; char elmalign; - - /* - * we should guarantee that array will not be destroyed during all - * operation - */ - array = PG_GETARG_ARRAYTYPE_P_COPY(0); - - ARRAYCHECK(array); + Datum *elems; + bool *nulls; + int nelems; get_typlenbyvalalign(ARR_ELEMTYPE(array), &elmlen, &elmbyval, &elmalign); @@ -58,89 +49,159 @@ ginarrayextract(PG_FUNCTION_ARGS) deconstruct_array(array, ARR_ELEMTYPE(array), elmlen, elmbyval, elmalign, - &entries, NULL, (int *) nentries); + &elems, &nulls, &nelems); - if (*nentries == 0 && PG_NARGS() == 3) - { - switch (PG_GETARG_UINT16(2)) /* StrategyNumber */ - { - case GinOverlapStrategy: - *nentries = -1; /* nobody can be found */ - break; - case GinContainsStrategy: - case GinContainedStrategy: - case GinEqualStrategy: - default: /* require fullscan: GIN can't find void - * arrays */ - break; - } - } + *nkeys = nelems; + *nullFlags = nulls; - /* we should not free array, entries[i] points into it */ - PG_RETURN_POINTER(entries); + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); } +/* + * Formerly, ginarrayextract had only two 
arguments. Now it has three, + * but we still need a pg_proc entry with two args to support reloading + * pre-9.1 contrib/intarray opclass declarations. This compatibility + * function should go away eventually. + */ +Datum +ginarrayextract_2args(PG_FUNCTION_ARGS) +{ + if (PG_NARGS() < 3) /* should not happen */ + elog(ERROR, "ginarrayextract requires three arguments"); + return ginarrayextract(fcinfo); +} + +/* + * extractQuery support function + */ Datum ginqueryarrayextract(PG_FUNCTION_ARGS) { - PG_RETURN_DATUM(DirectFunctionCall3(ginarrayextract, - PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1), - PG_GETARG_DATUM(2))); + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + + /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool **nullFlags = (bool **) PG_GETARG_POINTER(5); + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elems; + bool *nulls; + int nelems; + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &elmlen, &elmbyval, &elmalign); + + deconstruct_array(array, + ARR_ELEMTYPE(array), + elmlen, elmbyval, elmalign, + &elems, &nulls, &nelems); + + *nkeys = nelems; + *nullFlags = nulls; + + switch (strategy) + { + case GinOverlapStrategy: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + case GinContainsStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else /* everything contains the empty set */ + *searchMode = GIN_SEARCH_MODE_ALL; + break; + case GinContainedStrategy: + /* empty set is contained in everything */ + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + case GinEqualStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + default: + elog(ERROR, "ginqueryarrayextract: unknown strategy number: %d", + strategy); + } + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); } +/* + * consistent support function + */ Datum ginarrayconsistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); StrategyNumber strategy = PG_GETARG_UINT16(1); - ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); - /* int32 nkeys = PG_GETARG_INT32(3); */ + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ bool *recheck = (bool *) PG_GETARG_POINTER(5); - bool res; - int i, - nentries; - /* ARRAYCHECK was already done by previous ginarrayextract call */ + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(7); + bool res; + int32 i; switch (strategy) { case GinOverlapStrategy: /* result is not lossy */ *recheck = false; - /* at least one element in check[] is true, so result = true */ - res = true; - break; - case GinContainedStrategy: - /* we will need recheck */ - *recheck = true; - /* at least one element in check[] is true, so result = true */ - res = true; + /* must have a match for at least one non-null element */ + res = false; + for (i = 0; i < nkeys; i++) + { + if (check[i] && !nullFlags[i]) + { + res = true; + break; + } + } break; case GinContainsStrategy: /* result is not lossy */ *recheck = false; - /* must have all elements in check[] true */ - nentries = ArrayGetNItems(ARR_NDIM(query), ARR_DIMS(query)); 
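    /*
     * Reader's sketch (not part of the patch) of the consistent-function
     * contract implied by the argument list above: args 0-5 are the
     * pre-9.1 check[], strategy, query, nkeys, extra_data, and recheck,
     * while args 6 and 7 add queryKeys[] and nullFlags[], arrays of
     * length nkeys parallel to check[].  check[i] reports whether query
     * key i was found in the indexed item, and nullFlags[i] whether that
     * key is itself a null; the rewritten strategy cases below consult
     * both, e.g. containment requires check[i] && !nullFlags[i] for
     * every key.
     */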
+ /* must have all elements in check[] true, and no nulls */ res = true; - for (i = 0; i < nentries; i++) + for (i = 0; i < nkeys; i++) { - if (!check[i]) + if (!check[i] || nullFlags[i]) { res = false; break; } } break; + case GinContainedStrategy: + /* we will need recheck */ + *recheck = true; + /* can't do anything else useful here */ + res = true; + break; case GinEqualStrategy: /* we will need recheck */ *recheck = true; - /* must have all elements in check[] true */ - nentries = ArrayGetNItems(ARR_NDIM(query), ARR_DIMS(query)); + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... + */ res = true; - for (i = 0; i < nentries; i++) + for (i = 0; i < nkeys; i++) { if (!check[i]) { diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index bb150a4689..739fa8afff 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -4,17 +4,17 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginbtree.c,v 1.15 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/gin/ginbtree.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "utils/rel.h" @@ -104,7 +104,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) * ok, page is correctly locked, we should check to move right .., * root never has a right link, so small optimization */ - while (btree->fullScan == FALSE && stack->blkno != rootBlkno && btree->isMoveRight(btree, page)) + while (btree->fullScan == FALSE && stack->blkno != rootBlkno && + btree->isMoveRight(btree, page)) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; @@ -173,8 +174,8 @@ freeGinBtreeStack(GinBtreeStack *stack) * with vacuum process */ void -findParents(GinBtree btree, GinBtreeStack *stack, - BlockNumber rootBlkno) +ginFindParents(GinBtree btree, GinBtreeStack *stack, + BlockNumber rootBlkno) { Page page; @@ -226,7 +227,6 @@ findParents(GinBtree btree, GinBtreeStack *stack, LockBuffer(root->buffer, GIN_UNLOCK); Assert(blkno != InvalidBlockNumber); - for (;;) { buffer = ReadBuffer(btree->index, blkno); @@ -267,9 +267,14 @@ findParents(GinBtree btree, GinBtreeStack *stack, /* * Insert value (stored in GinBtree) to tree described by stack + * + * During an index build, buildStats is non-null and the counters + * it contains should be incremented as needed. + * + * NB: the passed-in stack is freed, as though by freeGinBtreeStack. 
*/ void -ginInsertValue(GinBtree btree, GinBtreeStack *stack) +ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) { GinBtreeStack *parent = stack; BlockNumber rootBlkno = InvalidBuffer; @@ -299,7 +304,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) MarkBufferDirty(stack->buffer); - if (!btree->index->rd_istemp) + if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; @@ -308,10 +313,11 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) PageSetTLI(page, ThisTimeLineID); } - UnlockReleaseBuffer(stack->buffer); + LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); - freeGinBtreeStack(stack->parent); + freeGinBtreeStack(stack); + return; } else @@ -325,9 +331,17 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata); - ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno; + /* During index build, count the newly-split page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + parent = stack->parent; if (parent == NULL) @@ -341,7 +355,6 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE; ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber; - page = BufferGetPage(stack->buffer); lpage = BufferGetPage(lbuffer); rpage = BufferGetPage(rbuffer); @@ -360,7 +373,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) MarkBufferDirty(lbuffer); MarkBufferDirty(stack->buffer); - if (!btree->index->rd_istemp) + if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; @@ -375,10 +388,20 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) UnlockReleaseBuffer(rbuffer); UnlockReleaseBuffer(lbuffer); - UnlockReleaseBuffer(stack->buffer); - + LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); + freeGinBtreeStack(stack); + + /* During index build, count the newly-added root page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + return; } else @@ -399,7 +422,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); - if (!btree->index->rd_istemp) + if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; @@ -433,7 +456,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) * rightmost page, but we don't find parent, we should use * plain search... 
*/ - findParents(btree, stack, rootBlkno); + ginFindParents(btree, stack, rootBlkno); parent = stack->parent; page = BufferGetPage(parent->buffer); break; diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index bb726e69f4..9e5bab194d 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -4,83 +4,117 @@ * routines for fast build of inverted index * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.19 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gin/ginbulk.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "utils/datum.h" #include "utils/memutils.h" -#define DEF_NENTRY 2048 -#define DEF_NPTR 4 +#define DEF_NENTRY 2048 /* GinEntryAccumulator allocation quantum */ +#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */ -static void * -ginAppendData(void *old, void *new, void *arg) -{ - EntryAccumulator *eo = (EntryAccumulator *) old, - *en = (EntryAccumulator *) new; +/* Combiner function for rbtree.c */ +static void +ginCombineData(RBNode *existing, const RBNode *newdata, void *arg) +{ + GinEntryAccumulator *eo = (GinEntryAccumulator *) existing; + const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata; BuildAccumulator *accum = (BuildAccumulator *) arg; - if (eo->number >= eo->length) + /* + * Note this code assumes that newdata contains only one itempointer. + */ + if (eo->count >= eo->maxcount) { accum->allocatedMemory -= GetMemoryChunkSpace(eo->list); - eo->length *= 2; - eo->list = (ItemPointerData *) repalloc(eo->list, - sizeof(ItemPointerData) * eo->length); + eo->maxcount *= 2; + eo->list = (ItemPointerData *) + repalloc(eo->list, sizeof(ItemPointerData) * eo->maxcount); accum->allocatedMemory += GetMemoryChunkSpace(eo->list); } - /* If item pointers are not ordered, they will need to be sorted. */ + /* If item pointers are not ordered, they will need to be sorted later */ if (eo->shouldSort == FALSE) { int res; - res = compareItemPointers(eo->list + eo->number - 1, en->list); + res = ginCompareItemPointers(eo->list + eo->count - 1, en->list); Assert(res != 0); if (res > 0) eo->shouldSort = TRUE; } - eo->list[eo->number] = en->list[0]; - eo->number++; - - return old; + eo->list[eo->count] = en->list[0]; + eo->count++; } +/* Comparator function for rbtree.c */ static int -cmpEntryAccumulator(const void *a, const void *b, void *arg) +cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg) +{ + const GinEntryAccumulator *ea = (const GinEntryAccumulator *) a; + const GinEntryAccumulator *eb = (const GinEntryAccumulator *) b; + BuildAccumulator *accum = (BuildAccumulator *) arg; + + return ginCompareAttEntries(accum->ginstate, + ea->attnum, ea->key, ea->category, + eb->attnum, eb->key, eb->category); +} + +/* Allocator function for rbtree.c */ +static RBNode * +ginAllocEntryAccumulator(void *arg) { - EntryAccumulator *ea = (EntryAccumulator *) a; - EntryAccumulator *eb = (EntryAccumulator *) b; BuildAccumulator *accum = (BuildAccumulator *) arg; + GinEntryAccumulator *ea; + + /* + * Allocate memory by rather big chunks to decrease overhead. We have no + * need to reclaim RBNodes individually, so this costs nothing. 
+ */ + if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) + { + accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY); + accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator); + accum->eas_used = 0; + } + + /* Allocate new RBNode from current chunk */ + ea = accum->entryallocator + accum->eas_used; + accum->eas_used++; - return compareAttEntries(accum->ginstate, ea->attnum, ea->value, - eb->attnum, eb->value); + return (RBNode *) ea; } void ginInitBA(BuildAccumulator *accum) { + /* accum->ginstate is intentionally not set here */ accum->allocatedMemory = 0; accum->entryallocator = NULL; - accum->tree = rb_create(cmpEntryAccumulator, ginAppendData, NULL, accum); - accum->iterator = NULL; - accum->tmpList = NULL; + accum->eas_used = 0; + accum->tree = rb_create(sizeof(GinEntryAccumulator), + cmpEntryAccumulator, + ginCombineData, + ginAllocEntryAccumulator, + NULL, /* no freefunc needed */ + (void *) accum); } /* - * This is basically the same as datumCopy(), but modified to count - * palloc'd space in accum. + * This is basically the same as datumCopy(), but extended to count + * palloc'd space in accum->allocatedMemory. */ static Datum getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value) @@ -102,62 +136,53 @@ getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value) * Find/store one entry from indexed value. */ static void -ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum entry) +ginInsertBAEntry(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum key, GinNullCategory category) { - EntryAccumulator *key, - *ea; + GinEntryAccumulator eatmp; + GinEntryAccumulator *ea; + bool isNew; /* - * Allocate memory by rather big chunk to decrease overhead, we don't keep - * pointer to previously allocated chunks because they will free by - * MemoryContextReset() call. + * For the moment, fill only the fields of eatmp that will be looked at by + * cmpEntryAccumulator or ginCombineData. */ - if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY) - { - accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY); - accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator); - accum->length = 0; - } - - /* "Allocate" new key in chunk */ - key = accum->entryallocator + accum->length; - accum->length++; - - key->attnum = attnum; - key->value = entry; - /* To prevent multiple palloc/pfree cycles, we reuse array */ - if (accum->tmpList == NULL) - accum->tmpList = - (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR); - key->list = accum->tmpList; - key->list[0] = *heapptr; + eatmp.attnum = attnum; + eatmp.key = key; + eatmp.category = category; + /* temporarily set up single-entry itempointer list */ + eatmp.list = heapptr; - ea = rb_insert(accum->tree, key); + ea = (GinEntryAccumulator *) rb_insert(accum->tree, (RBNode *) &eatmp, + &isNew); - if (ea == NULL) + if (isNew) { /* - * The key has been inserted, so continue initialization. + * Finish initializing new tree entry, including making permanent + * copies of the datum (if it's not null) and itempointer. 
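The rb_insert protocol shown here is insert-or-combine: the caller passes a template node filled with only the fields the comparator and combiner read, then finishes initialization only if isNew comes back true; otherwise the combiner callback has already merged the data. A standalone sketch of that calling convention, with a trivial array standing in for rbtree.c (all names hypothetical):

    #include <stdbool.h>

    typedef struct Entry { int key; int count; } Entry;

    static Entry tree[256];             /* toy stand-in for the rbtree */
    static int   nentries = 0;

    /* Insert-or-find; on an existing key the "combiner" work happens
     * here, and *isNew tells the caller whether to finish init. */
    static Entry *
    insert_entry(const Entry *tmpl, bool *isNew)
    {
        for (int i = 0; i < nentries; i++)
        {
            if (tree[i].key == tmpl->key)
            {
                tree[i].count++;        /* combiner: merge new data in */
                *isNew = false;
                return &tree[i];
            }
        }
        tree[nentries] = *tmpl;         /* caller completes the rest */
        *isNew = true;
        return &tree[nentries++];
    }

    static void
    add_key(int key)
    {
        Entry  tmpl = { key, 0 };       /* fill only what lookup needs */
        bool   isNew;
        Entry *e = insert_entry(&tmpl, &isNew);

        if (isNew)
            e->count = 1;               /* finish initializing new node */
    }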
*/ - key->value = getDatumCopy(accum, attnum, entry); - key->length = DEF_NPTR; - key->number = 1; - key->shouldSort = FALSE; - accum->allocatedMemory += GetMemoryChunkSpace(key->list); - accum->tmpList = NULL; + if (category == GIN_CAT_NORM_KEY) + ea->key = getDatumCopy(accum, attnum, key); + ea->maxcount = DEF_NPTR; + ea->count = 1; + ea->shouldSort = FALSE; + ea->list = + (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR); + ea->list[0] = *heapptr; + accum->allocatedMemory += GetMemoryChunkSpace(ea->list); } else { /* - * The key has been appended, so "free" allocated key by decrementing - * chunk's counter. + * ginCombineData did everything needed. */ - accum->length--; } } /* - * Insert one heap pointer. + * Insert the entries for one heap pointer. * * Since the entries are being inserted into a balanced binary tree, you * might think that the order of insertion wouldn't be critical, but it turns @@ -169,22 +194,24 @@ ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum * We do this as follows. First, we imagine that we have an array whose size * is the smallest power of two greater than or equal to the actual array * size. Second, we insert the middle entry of our virtual array into the - * tree; then, we insert the middles of each half of out virtual array, then + * tree; then, we insert the middles of each half of our virtual array, then * middles of quarters, etc. */ void -ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, - Datum *entries, int32 nentry) +ginInsertBAEntries(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum *entries, GinNullCategory *categories, + int32 nentries) { - uint32 step = nentry; + uint32 step = nentries; - if (nentry <= 0) + if (nentries <= 0) return; Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber); /* - * step will contain largest power of 2 and <= nentry + * step will contain largest power of 2 and <= nentries */ step |= (step >> 1); step |= (step >> 2); @@ -198,8 +225,9 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att { int i; - for (i = step - 1; i < nentry && i >= 0; i += step << 1 /* *2 */ ) - ginInsertEntry(accum, heapptr, attnum, entries[i]); + for (i = step - 1; i < nentries && i >= 0; i += step << 1 /* *2 */ ) + ginInsertBAEntry(accum, heapptr, attnum, + entries[i], categories[i]); step >>= 1; /* /2 */ } @@ -208,35 +236,49 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att static int qsortCompareItemPointers(const void *a, const void *b) { - int res = compareItemPointers((ItemPointer) a, (ItemPointer) b); + int res = ginCompareItemPointers((ItemPointer) a, (ItemPointer) b); + /* Assert that there are no equal item pointers being sorted */ Assert(res != 0); return res; } +/* Prepare to read out the rbtree contents using ginGetBAEntry */ +void +ginBeginBAScan(BuildAccumulator *accum) +{ + rb_begin_iterate(accum->tree, LeftRightWalk); +} + +/* + * Get the next entry in sequence from the BuildAccumulator's rbtree. + * This consists of a single key datum and a list (array) of one or more + * heap TIDs in which that key is found. The list is guaranteed sorted. 
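A hedged usage sketch of this read-out interface, assuming gin_private.h and a BuildAccumulator already filled by ginInsertBAEntries; it is the same loop shape the pending-list cleanup code later in this patch drives:

    ItemPointerData *list;
    Datum            key;
    GinNullCategory  category;
    OffsetNumber     attnum;
    uint32           nlist;

    ginBeginBAScan(&accum);
    while ((list = ginGetBAEntry(&accum,
                                 &attnum, &key, &category, &nlist)) != NULL)
    {
        /* list holds nlist sorted heap TIDs for (attnum, key, category);
         * hand them to the entry-insertion code here */
    }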
+ */ ItemPointerData * -ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *value, uint32 *n) +ginGetBAEntry(BuildAccumulator *accum, + OffsetNumber *attnum, Datum *key, GinNullCategory *category, + uint32 *n) { - EntryAccumulator *entry; + GinEntryAccumulator *entry; ItemPointerData *list; - if (accum->iterator == NULL) - accum->iterator = rb_begin_iterate(accum->tree, LeftRightWalk); - - entry = rb_iterate(accum->iterator); + entry = (GinEntryAccumulator *) rb_iterate(accum->tree); if (entry == NULL) - return NULL; + return NULL; /* no more entries */ - *n = entry->number; *attnum = entry->attnum; - *value = entry->value; + *key = entry->key; + *category = entry->category; list = entry->list; + *n = entry->count; - Assert(list != NULL); + Assert(list != NULL && entry->count > 0); - if (entry->shouldSort && entry->number > 1) - qsort(list, *n, sizeof(ItemPointerData), qsortCompareItemPointers); + if (entry->shouldSort && entry->count > 1) + qsort(list, entry->count, sizeof(ItemPointerData), + qsortCompareItemPointers); return list; } diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index b456e167ce..41dbe9fd11 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -4,31 +4,37 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gindatapage.c,v 1.17 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/gin/gindatapage.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "storage/bufmgr.h" #include "utils/rel.h" int -compareItemPointers(ItemPointer a, ItemPointer b) +ginCompareItemPointers(ItemPointer a, ItemPointer b) { - if (GinItemPointerGetBlockNumber(a) == GinItemPointerGetBlockNumber(b)) + BlockNumber ba = GinItemPointerGetBlockNumber(a); + BlockNumber bb = GinItemPointerGetBlockNumber(b); + + if (ba == bb) { - if (GinItemPointerGetOffsetNumber(a) == GinItemPointerGetOffsetNumber(b)) + OffsetNumber oa = GinItemPointerGetOffsetNumber(a); + OffsetNumber ob = GinItemPointerGetOffsetNumber(b); + + if (oa == ob) return 0; - return (GinItemPointerGetOffsetNumber(a) > GinItemPointerGetOffsetNumber(b)) ? 1 : -1; + return (oa > ob) ? 1 : -1; } - return (GinItemPointerGetBlockNumber(a) > GinItemPointerGetBlockNumber(b)) ? 1 : -1; + return (ba > bb) ? 1 : -1; } /* @@ -37,9 +43,9 @@ compareItemPointers(ItemPointer a, ItemPointer b) * Caller is responsible that there is enough space at *dst. */ uint32 -MergeItemPointers(ItemPointerData *dst, - ItemPointerData *a, uint32 na, - ItemPointerData *b, uint32 nb) +ginMergeItemPointers(ItemPointerData *dst, + ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb) { ItemPointerData *dptr = dst; ItemPointerData *aptr = a, @@ -47,7 +53,7 @@ MergeItemPointers(ItemPointerData *dst, while (aptr - a < na && bptr - b < nb) { - int cmp = compareItemPointers(aptr, bptr); + int cmp = ginCompareItemPointers(aptr, bptr); if (cmp > 0) *dptr++ = *bptr++; @@ -82,7 +88,7 @@ dataIsMoveRight(GinBtree btree, Page page) if (GinPageRightMost(page)) return FALSE; - return (compareItemPointers(btree->items + btree->curitem, iptr) > 0) ? 
TRUE : FALSE; + return (ginCompareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE; } /* @@ -122,16 +128,17 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) pitem = (PostingItem *) GinDataPageGetItem(page, mid); if (mid == maxoff) - + { /* * Right infinity, page already correctly chosen with a help of * dataIsMoveRight */ result = -1; + } else { pitem = (PostingItem *) GinDataPageGetItem(page, mid); - result = compareItemPointers(btree->items + btree->curitem, &(pitem->key)); + result = ginCompareItemPointers(btree->items + btree->curitem, &(pitem->key)); } if (result == 0) @@ -189,7 +196,7 @@ dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack) { OffsetNumber mid = low + ((high - low) / 2); - result = compareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid)); + result = ginCompareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid)); if (result == 0) { @@ -220,7 +227,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor Assert(!GinPageIsLeaf(page)); Assert(GinPageIsData(page)); - /* if page isn't changed, we returns storedOff */ + /* if page isn't changed, we return storedOff */ if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) { pitem = (PostingItem *) GinDataPageGetItem(page, storedOff); @@ -286,9 +293,11 @@ GinDataPageAddItem(Page page, void *data, OffsetNumber offset) { ptr = GinDataPageGetItem(page, offset); if (maxoff + 1 - offset != 0) - memmove(ptr + GinSizeOfItem(page), ptr, (maxoff - offset + 1) * GinSizeOfItem(page)); + memmove(ptr + GinSizeOfDataPageItem(page), + ptr, + (maxoff - offset + 1) * GinSizeOfDataPageItem(page)); } - memcpy(ptr, data, GinSizeOfItem(page)); + memcpy(ptr, data, GinSizeOfDataPageItem(page)); GinPageGetOpaque(page)->maxoff++; } @@ -297,7 +306,7 @@ GinDataPageAddItem(Page page, void *data, OffsetNumber offset) * Deletes posting item from non-leaf page */ void -PageDeletePostingItem(Page page, OffsetNumber offset) +GinPageDeletePostingItem(Page page, OffsetNumber offset) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; @@ -372,10 +381,12 @@ static void dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) { Page page = BufferGetPage(buf); + int sizeofitem = GinSizeOfDataPageItem(page); + int cnt = 0; + + /* these must be static so they can be returned to caller */ static XLogRecData rdata[3]; - int sizeofitem = GinSizeOfItem(page); static ginxlogInsert data; - int cnt = 0; *prdata = rdata; Assert(GinPageIsData(page)); @@ -453,21 +464,23 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda static Page dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata) { - static ginxlogSplit data; - static XLogRecData rdata[4]; - static char vector[2 * BLCKSZ]; char *ptr; OffsetNumber separator; ItemPointer bound; Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); ItemPointerData oldbound = *GinDataPageGetRightBound(lpage); - int sizeofitem = GinSizeOfItem(lpage); + int sizeofitem = GinSizeOfDataPageItem(lpage); OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff; Page rpage = BufferGetPage(rbuf); Size pageSize = PageGetPageSize(lpage); Size freeSpace; uint32 nCopied = 1; + /* these must be static so they can be returned to caller */ + static ginxlogSplit data; + static XLogRecData rdata[4]; + static char vector[2 * BLCKSZ]; + GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); freeSpace = 
GinDataPageGetFreeSpace(rpage); @@ -482,9 +495,11 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff) { nCopied = 0; - while (btree->curitem < btree->nitem && maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData))) + while (btree->curitem < btree->nitem && + maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData))) { - memcpy(vector + maxoff * sizeof(ItemPointerData), btree->items + btree->curitem, + memcpy(vector + maxoff * sizeof(ItemPointerData), + btree->items + btree->curitem, sizeof(ItemPointerData)); maxoff++; nCopied++; @@ -571,7 +586,7 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe * Also called from ginxlog, should not use btree */ void -dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) +ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) { Page page = BufferGetPage(root), lpage = BufferGetPage(lbuf), @@ -589,20 +604,23 @@ dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) } void -prepareDataScan(GinBtree btree, Relation index) +ginPrepareDataScan(GinBtree btree, Relation index) { memset(btree, 0, sizeof(GinBtreeData)); + btree->index = index; - btree->isMoveRight = dataIsMoveRight; + btree->findChildPage = dataLocateItem; + btree->isMoveRight = dataIsMoveRight; btree->findItem = dataLocateLeafItem; btree->findChildPtr = dataFindChildPtr; btree->getLeftMostPage = dataGetLeftMostPage; btree->isEnoughSpace = dataIsEnoughSpace; btree->placeToPage = dataPlaceToPage; btree->splitPage = dataSplitPage; - btree->fillRoot = dataFillRoot; + btree->fillRoot = ginDataFillRoot; + btree->isData = TRUE; btree->searchMode = FALSE; btree->isDelete = FALSE; btree->fullScan = FALSE; @@ -610,11 +628,11 @@ prepareDataScan(GinBtree btree, Relation index) } GinPostingTreeScan * -prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode) +ginPrepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode) { GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan)); - prepareDataScan(&gdi->btree, index); + ginPrepareDataScan(&gdi->btree, index); gdi->btree.searchMode = searchMode; gdi->btree.fullScan = searchMode; @@ -628,7 +646,9 @@ prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode) * Inserts array of item pointers, may execute several tree scan (very rare) */ void -insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) +ginInsertItemPointers(GinPostingTreeScan *gdi, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) { BlockNumber rootBlkno = gdi->stack->blkno; @@ -653,14 +673,14 @@ insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) freeGinBtreeStack(gdi->stack); } else - ginInsertValue(&(gdi->btree), gdi->stack); + ginInsertValue(&(gdi->btree), gdi->stack, buildStats); gdi->stack = NULL; } } Buffer -scanBeginPostingTree(GinPostingTreeScan *gdi) +ginScanBeginPostingTree(GinPostingTreeScan *gdi) { gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack); return gdi->stack->buffer; diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 6d307c8d59..fa134f9fc3 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -4,17 +4,17 @@ * page utilities routines for the postgres inverted index access method. 
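ginMergeItemPointers above is a two-pointer merge of already-sorted TID arrays. A standalone sketch of the same shape over plain ints; how the real function treats keys present in both inputs is not visible in this hunk, so this version simply keeps them:

    #include <stddef.h>

    /* Merge two ascending arrays into dst, which the caller must size
     * to hold na + nb elements, as in ginMergeItemPointers. */
    static size_t
    merge_sorted(int *dst, const int *a, size_t na,
                 const int *b, size_t nb)
    {
        size_t ia = 0, ib = 0, n = 0;

        while (ia < na && ib < nb)
        {
            if (b[ib] < a[ia])
                dst[n++] = b[ib++];     /* cmp > 0: take from b */
            else
                dst[n++] = a[ia++];     /* cmp <= 0: take from a */
        }
        while (ia < na)
            dst[n++] = a[ia++];         /* drain leftovers */
        while (ib < nb)
            dst[n++] = b[ib++];
        return n;
    }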
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.24 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gin/ginentrypage.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "storage/bufmgr.h" #include "utils/rel.h" @@ -24,107 +24,116 @@ * If the tuple would be too big to be stored, function throws a suitable * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE. * - * On leaf pages, Index tuple has non-traditional layout. Tuple may contain - * posting list or root blocknumber of posting tree. - * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno) - * 1) Posting list - * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual - * - ItemPointerGetBlockNumber(&itup->t_tid) contains original - * size of tuple (without posting list). - * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n) - * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number - * of elements in posting list (number of heap itempointers) - * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) - * - After standard part of tuple there is a posting list, ie, array - * of heap itempointers - * Macros: GinGetPosting(itup) - * 2) Posting tree - * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual - * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of - * root of posting tree - * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number - * GIN_TREE_POSTING, which distinguishes this from posting-list case - * - * Attributes of an index tuple are different for single and multicolumn index. - * For single-column case, index tuple stores only value to be indexed. - * For multicolumn case, it stores two attributes: column number of value - * and value. + * See src/backend/access/gin/README for a description of the index tuple + * format that is being built here. We build on the assumption that we + * are making a leaf-level key entry containing a posting list of nipd items. + * If the caller is actually trying to make a posting-tree entry, non-leaf + * entry, or pending-list entry, it should pass nipd = 0 and then overwrite + * the t_tid fields as necessary. In any case, ipd can be NULL to skip + * copying any itempointers into the posting list; the caller is responsible + * for filling the posting list afterwards, if ipd = NULL and nipd > 0. 
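Per the contract in the comment above, a hedged usage sketch of GinFormTuple for a leaf entry with an inline posting list; ginstate, attnum, key, and the sorted array ipd[] of nipd TIDs are assumed to be set up elsewhere:

    IndexTuple itup;

    /* errorTooBig = false: get NULL back instead of ereport(ERROR)
     * if the posting list would push the tuple past GinMaxItemSize */
    itup = GinFormTuple(ginstate, attnum, key, GIN_CAT_NORM_KEY,
                        ipd, nipd, false);
    if (itup == NULL)
    {
        /* too big; the caller must fall back, for example by moving
         * the TIDs into a posting tree */
    }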
*/ IndexTuple -GinFormTuple(Relation index, GinState *ginstate, - OffsetNumber attnum, Datum key, - ItemPointerData *ipd, uint32 nipd, bool errorTooBig) +GinFormTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *ipd, uint32 nipd, + bool errorTooBig) { - bool isnull[2] = {FALSE, FALSE}; + Datum datums[2]; + bool isnull[2]; IndexTuple itup; uint32 newsize; + /* Build the basic tuple: optional column number, plus key datum */ if (ginstate->oneCol) - itup = index_form_tuple(ginstate->origTupdesc, &key, isnull); + { + datums[0] = key; + isnull[0] = (category != GIN_CAT_NORM_KEY); + } else { - Datum datums[2]; - datums[0] = UInt16GetDatum(attnum); + isnull[0] = false; datums[1] = key; - itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); + isnull[1] = (category != GIN_CAT_NORM_KEY); } - GinSetOrigSizePosting(itup, IndexTupleSize(itup)); + itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); + + /* + * Determine and store offset to the posting list, making sure there is + * room for the category byte if needed. + * + * Note: because index_form_tuple MAXALIGNs the tuple size, there may well + * be some wasted pad space. Is it worth recomputing the data length to + * prevent that? That would also allow us to Assert that the real data + * doesn't overlap the GinNullCategory byte, which this code currently + * takes on faith. + */ + newsize = IndexTupleSize(itup); - if (nipd > 0) + if (IndexTupleHasNulls(itup)) { - newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd); - if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) - { - if (errorTooBig) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", - (unsigned long) newsize, - (unsigned long) Min(INDEX_SIZE_MASK, - GinMaxItemSize), - RelationGetRelationName(index)))); - return NULL; - } + uint32 minsize; + + Assert(category != GIN_CAT_NORM_KEY); + minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory); + newsize = Max(newsize, minsize); + } + + newsize = SHORTALIGN(newsize); + + GinSetPostingOffset(itup, newsize); + GinSetNPosting(itup, nipd); + + /* + * Add space needed for posting list, if any. Then check that the tuple + * won't be too big to store. + */ + newsize += sizeof(ItemPointerData) * nipd; + newsize = MAXALIGN(newsize); + if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) + { + if (errorTooBig) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", + (unsigned long) newsize, + (unsigned long) Min(INDEX_SIZE_MASK, + GinMaxItemSize), + RelationGetRelationName(ginstate->index)))); + pfree(itup); + return NULL; + } + + /* + * Resize tuple if needed + */ + if (newsize != IndexTupleSize(itup)) + { itup = repalloc(itup, newsize); - /* set new size */ + /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; - - if (ipd) - memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); - GinSetNPosting(itup, nipd); } - else - { - /* - * Gin tuple without any ItemPointers should be large enough to keep - * one ItemPointer, to prevent inconsistency between - * ginHeapTupleFastCollect and ginEntryInsert called by - * ginHeapTupleInsert. ginHeapTupleFastCollect forms tuple without - * extra pointer to heap, but ginEntryInsert (called for pending list - * cleanup during vacuum) will form the same tuple with one - * ItemPointer. 
- */ - newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData)); - if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) - { - if (errorTooBig) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", - (unsigned long) newsize, - (unsigned long) Min(INDEX_SIZE_MASK, - GinMaxItemSize), - RelationGetRelationName(index)))); - return NULL; - } - GinSetNPosting(itup, 0); + /* + * Insert category byte, if needed + */ + if (category != GIN_CAT_NORM_KEY) + { + Assert(IndexTupleHasNulls(itup)); + GinSetNullCategory(itup, ginstate, category); } + + /* + * Copy in the posting list, if provided + */ + if (ipd) + memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); + return itup; } @@ -140,7 +149,8 @@ GinShortenTuple(IndexTuple itup, uint32 nipd) Assert(nipd <= GinGetNPosting(itup)); - newsize = MAXALIGN(SHORTALIGN(GinGetOrigSizePosting(itup)) + sizeof(ItemPointerData) * nipd); + newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd; + newsize = MAXALIGN(newsize); Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK)); @@ -151,8 +161,45 @@ GinShortenTuple(IndexTuple itup, uint32 nipd) } /* + * Form a non-leaf entry tuple by copying the key data from the given tuple, + * which can be either a leaf or non-leaf entry tuple. + * + * Any posting list in the source tuple is not copied. The specified child + * block number is inserted into t_tid. + */ +static IndexTuple +GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk) +{ + IndexTuple nitup; + + if (GinPageIsLeaf(page) && !GinIsPostingTree(itup)) + { + /* Tuple contains a posting list, just copy stuff before that */ + uint32 origsize = GinGetPostingOffset(itup); + + origsize = MAXALIGN(origsize); + nitup = (IndexTuple) palloc(origsize); + memcpy(nitup, itup, origsize); + /* ... be sure to fix the size header field ... */ + nitup->t_info &= ~INDEX_SIZE_MASK; + nitup->t_info |= origsize; + } + else + { + /* Copy the tuple as-is */ + nitup = (IndexTuple) palloc(IndexTupleSize(itup)); + memcpy(nitup, itup, IndexTupleSize(itup)); + } + + /* Now insert the correct downlink */ + GinSetDownlink(nitup, childblk); + + return nitup; +} + +/* * Entry tree is a "static", ie tuple never deletes from it, - * so we don't use right bound, we use rightest key instead. + * so we don't use right bound, we use rightmost key instead. */ static IndexTuple getRightMostTuple(Page page) @@ -166,16 +213,20 @@ static bool entryIsMoveRight(GinBtree btree, Page page) { IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; if (GinPageRightMost(page)) return FALSE; itup = getRightMostTuple(page); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); - if (compareAttEntries(btree->ginstate, - btree->entryAttnum, btree->entryValue, - gintuple_get_attrnum(btree->ginstate, itup), - gin_index_getattr(btree->ginstate, itup)) > 0) + if (ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, btree->entryKey, btree->entryCategory, + attnum, key, category) > 0) return TRUE; return FALSE; @@ -183,7 +234,7 @@ entryIsMoveRight(GinBtree btree, Page page) /* * Find correct tuple in non-leaf page. 
It supposed that - * page correctly choosen and searching value SHOULD be on page + * page correctly chosen and searching value SHOULD be on page */ static BlockNumber entryLocateEntry(GinBtree btree, GinBtreeStack *stack) @@ -216,22 +267,31 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack) OffsetNumber mid = low + ((high - low) / 2); if (mid == maxoff && GinPageRightMost(page)) + { /* Right infinity */ result = -1; + } else { + OffsetNumber attnum; + Datum key; + GinNullCategory category; + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); - result = compareAttEntries(btree->ginstate, - btree->entryAttnum, btree->entryValue, - gintuple_get_attrnum(btree->ginstate, itup), - gin_index_getattr(btree->ginstate, itup)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); } if (result == 0) { stack->off = mid; - Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO); - return GinItemPointerGetBlockNumber(&(itup)->t_tid); + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); } else if (result > 0) low = mid + 1; @@ -243,13 +303,13 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack) stack->off = high; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high)); - Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO); - return GinItemPointerGetBlockNumber(&(itup)->t_tid); + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); } /* * Searches correct position for value on leaf page. - * Page should be corrrectly choosen. + * Page should be correctly chosen. * Returns true if value found on page. 
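Both page searches use the overflow-safe midpoint mid = low + ((high - low) / 2). A standalone sketch of the same search shape over a sorted array:

    #include <stdbool.h>
    #include <stddef.h>

    /* Binary search in the style of entryLocateEntry: returns an index
     * holding target (not necessarily the first such index) and sets
     * *found, or else the first index whose element is > target. */
    static size_t
    search(const int *arr, size_t n, int target, bool *found)
    {
        size_t low = 0, high = n;

        *found = false;
        while (high > low)
        {
            size_t mid = low + ((high - low) / 2);

            if (arr[mid] == target)
            {
                *found = true;
                return mid;
            }
            else if (arr[mid] < target)
                low = mid + 1;          /* result > 0: go right */
            else
                high = mid;             /* result < 0: go left */
        }
        return high;
    }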
*/ static bool @@ -258,7 +318,6 @@ entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) Page page = BufferGetPage(stack->buffer); OffsetNumber low, high; - IndexTuple itup; Assert(GinPageIsLeaf(page)); Assert(!GinPageIsData(page)); @@ -283,13 +342,20 @@ entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) while (high > low) { OffsetNumber mid = low + ((high - low) / 2); + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; int result; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); - result = compareAttEntries(btree->ginstate, - btree->entryAttnum, btree->entryValue, - gintuple_get_attrnum(btree->ginstate, itup), - gin_index_getattr(btree->ginstate, itup)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); if (result == 0) { stack->off = mid; @@ -319,7 +385,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff)); - if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno) + if (GinGetDownlink(itup) == blkno) return storedOff; /* @@ -329,7 +395,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto for (i = storedOff + 1; i <= maxoff; i++) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); - if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno) + if (GinGetDownlink(itup) == blkno) return i; } maxoff = storedOff - 1; @@ -339,7 +405,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto for (i = FirstOffsetNumber; i <= maxoff; i++) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); - if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno) + if (GinGetDownlink(itup) == blkno) return i; } @@ -356,7 +422,7 @@ entryGetLeftMostPage(GinBtree btree, Page page) Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); - return GinItemPointerGetBlockNumber(&(itup)->t_tid); + return GinGetDownlink(itup); } static bool @@ -404,7 +470,7 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); - ItemPointerSet(&itup->t_tid, btree->rightblkno, InvalidOffsetNumber); + GinSetDownlink(itup, btree->rightblkno); ret = btree->rightblkno; } @@ -420,11 +486,13 @@ static void entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) { Page page = BufferGetPage(buf); - static XLogRecData rdata[3]; OffsetNumber placed; - static ginxlogInsert data; int cnt = 0; + /* these must be static so they can be returned to caller */ + static XLogRecData rdata[3]; + static ginxlogInsert data; + *prdata = rdata; data.updateBlkno = entryPreparePage(btree, page, off); @@ -473,31 +541,6 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd } /* - * Returns new tuple with copied value from source tuple. 
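The "must be static" comments here mark a deliberate idiom: placeToPage and splitPage hand WAL record data back through *prdata, so the arrays have to outlive the function call. A minimal sketch of returning static storage through an out-parameter, with hypothetical names:

    typedef struct Rec { int id; const void *data; } Rec;

    static void
    build_records(const void *payload, Rec **out)
    {
        /* static: survives return, unlike automatic storage; the
         * trade-off is that each call overwrites the previous result,
         * and the function is not reentrant. */
        static Rec rdata[1];

        rdata[0].id = 1;
        rdata[0].data = payload;
        *out = rdata;
    }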
- * New tuple will not store posting list - */ -static IndexTuple -copyIndexTuple(IndexTuple itup, Page page) -{ - IndexTuple nitup; - - if (GinPageIsLeaf(page) && !GinIsPostingTree(itup)) - { - nitup = (IndexTuple) palloc(MAXALIGN(GinGetOrigSizePosting(itup))); - memcpy(nitup, itup, GinGetOrigSizePosting(itup)); - nitup->t_info &= ~INDEX_SIZE_MASK; - nitup->t_info |= GinGetOrigSizePosting(itup); - } - else - { - nitup = (IndexTuple) palloc(MAXALIGN(IndexTupleSize(itup))); - memcpy(nitup, itup, IndexTupleSize(itup)); - } - - return nitup; -} - -/* * Place tuple and split page, original buffer(lbuf) leaves untouched, * returns shadow page of lbuf filled new data. * Tuples are distributed between pages by equal size on its, not @@ -506,26 +549,28 @@ copyIndexTuple(IndexTuple itup, Page page) static Page entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata) { - static XLogRecData rdata[2]; OffsetNumber i, maxoff, separator = InvalidOffsetNumber; Size totalsize = 0; Size lsize = 0, size; - static char tupstore[2 * BLCKSZ]; char *ptr; IndexTuple itup, leftrightmost = NULL; - static ginxlogSplit data; Page page; Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); Page rpage = BufferGetPage(rbuf); Size pageSize = PageGetPageSize(lpage); + /* these must be static so they can be returned to caller */ + static XLogRecData rdata[2]; + static ginxlogSplit data; + static char tupstore[2 * BLCKSZ]; + *prdata = rdata; data.leftChildBlkno = (GinPageIsLeaf(lpage)) ? - InvalidOffsetNumber : GinItemPointerGetBlockNumber(&(btree->entry->t_tid)); + InvalidOffsetNumber : GinGetDownlink(btree->entry); data.updateBlkno = entryPreparePage(btree, lpage, off); maxoff = PageGetMaxOffsetNumber(lpage); @@ -586,8 +631,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR ptr += MAXALIGN(IndexTupleSize(itup)); } - btree->entry = copyIndexTuple(leftrightmost, lpage); - ItemPointerSet(&(btree->entry)->t_tid, BufferGetBlockNumber(lbuf), InvalidOffsetNumber); + btree->entry = GinFormInteriorTuple(leftrightmost, lpage, + BufferGetBlockNumber(lbuf)); btree->rightblkno = BufferGetBlockNumber(rbuf); @@ -615,7 +660,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR } /* - * return newly allocate rightmost tuple + * return newly allocated rightmost tuple */ IndexTuple ginPageGetLinkItup(Buffer buf) @@ -625,8 +670,7 @@ ginPageGetLinkItup(Buffer buf) Page page = BufferGetPage(buf); itup = getRightMostTuple(page); - nitup = copyIndexTuple(itup, page); - ItemPointerSet(&nitup->t_tid, BufferGetBlockNumber(buf), InvalidOffsetNumber); + nitup = GinFormInteriorTuple(itup, page, BufferGetBlockNumber(buf)); return nitup; } @@ -636,7 +680,7 @@ ginPageGetLinkItup(Buffer buf) * Also called from ginxlog, should not use btree */ void -entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) +ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) { Page page; IndexTuple itup; @@ -646,34 +690,47 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) itup = ginPageGetLinkItup(lbuf); if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); + pfree(itup); itup = ginPageGetLinkItup(rbuf); if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); + pfree(itup); } +/* + * Set 
up GinBtree for entry page access + * + * Note: during WAL recovery, there may be no valid data in ginstate + * other than a faked-up Relation pointer; the key datum is bogus too. + */ void -prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum value, GinState *ginstate) +ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, + Datum key, GinNullCategory category, + GinState *ginstate) { memset(btree, 0, sizeof(GinBtreeData)); - btree->isMoveRight = entryIsMoveRight; + btree->index = ginstate->index; + btree->ginstate = ginstate; + btree->findChildPage = entryLocateEntry; + btree->isMoveRight = entryIsMoveRight; btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; btree->getLeftMostPage = entryGetLeftMostPage; btree->isEnoughSpace = entryIsEnoughSpace; btree->placeToPage = entryPlaceToPage; btree->splitPage = entrySplitPage; - btree->fillRoot = entryFillRoot; + btree->fillRoot = ginEntryFillRoot; - btree->index = index; - btree->ginstate = ginstate; - btree->entryAttnum = attnum; - btree->entryValue = value; - - btree->isDelete = FALSE; + btree->isData = FALSE; btree->searchMode = FALSE; btree->fullScan = FALSE; btree->isBuild = FALSE; + + btree->entryAttnum = attnum; + btree->entryKey = key; + btree->entryCategory = category; + btree->isDelete = FALSE; } diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index f8e0b5ad40..82419e37ac 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -7,19 +7,18 @@ * transfer pending entries into the regular index structure. This * wins because bulk insertion is much more efficient than retail. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginfast.c,v 1.7 2010/02/11 14:29:50 teodor Exp $ + * src/backend/access/gin/ginfast.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/genam.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "catalog/index.h" #include "commands/vacuum.h" #include "miscadmin.h" @@ -30,12 +29,13 @@ #define GIN_PAGE_FREESIZE \ ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) -typedef struct DatumArray +typedef struct KeyArray { - Datum *values; /* expansible array */ + Datum *keys; /* expansible array */ + GinNullCategory *categories; /* another expansible array */ int32 nvalues; /* current number of valid entries */ - int32 maxvalues; /* allocated size of array */ -} DatumArray; + int32 maxvalues; /* allocated size of arrays */ +} KeyArray; /* @@ -88,8 +88,9 @@ writeListPage(Relation index, Buffer buffer, GinPageGetOpaque(page)->rightlink = rightlink; /* - * tail page may contain only the whole row(s) or final part of row placed - * on previous pages + * tail page may contain only whole row(s) or final part of row placed on + * previous pages (a "row" here meaning all the index tuples generated for + * one heap tuple) */ if (rightlink == InvalidBlockNumber) { @@ -103,7 +104,7 @@ writeListPage(Relation index, Buffer buffer, MarkBufferDirty(buffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecData rdata[2]; ginxlogInsertListPage data; @@ -210,13 +211,16 @@ makeSublist(Relation index, IndexTuple *tuples, int32 ntuples, } /* - * Inserts collected 
values during normal insertion. Function guarantees - * that all values of heap will be stored sequentially, preserving order + * Write the index tuples contained in *collector into the index's + * pending list. + * + * Function guarantees that all these tuples will be inserted consecutively, + * preserving order */ void -ginHeapTupleFastInsert(Relation index, GinState *ginstate, - GinTupleCollector *collector) +ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { + Relation index = ginstate->index; Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; @@ -291,7 +295,12 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate, */ START_CRIT_SECTION(); - memcpy(metadata, &sublist, sizeof(GinMetaPageData)); + metadata->head = sublist.head; + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + metadata->nPendingPages = sublist.nPendingPages; + metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; } else { @@ -384,7 +393,7 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate, */ MarkBufferDirty(metabuffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; @@ -421,34 +430,40 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate, END_CRIT_SECTION(); if (needCleanup) - ginInsertCleanup(index, ginstate, false, NULL); + ginInsertCleanup(ginstate, false, NULL); } /* - * Collect values from one tuples to be indexed. All values for - * one tuples should be written at once - to guarantee consistent state + * Create temporary index tuples for a single indexable item (one index column + * for the heap tuple specified by ht_ctid), and append them to the array + * in *collector. They will subsequently be written out using + * ginHeapTupleFastInsert. Note that to guarantee consistent state, all + * temp tuples for a given heap tuple must be written in one call to + * ginHeapTupleFastInsert. */ -uint32 -ginHeapTupleFastCollect(Relation index, GinState *ginstate, +void +ginHeapTupleFastCollect(GinState *ginstate, GinTupleCollector *collector, - OffsetNumber attnum, Datum value, ItemPointer item) + OffsetNumber attnum, Datum value, bool isNull, + ItemPointer ht_ctid) { Datum *entries; + GinNullCategory *categories; int32 i, nentries; - entries = extractEntriesSU(ginstate, attnum, value, &nentries); - - if (nentries == 0) - /* nothing to insert */ - return 0; + /* + * Extract the key values that need to be inserted in the index + */ + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); /* * Allocate/reallocate memory for storing collected tuples */ if (collector->tuples == NULL) { - collector->lentuples = nentries * index->rd_att->natts; + collector->lentuples = nentries * ginstate->origTupdesc->natts; collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples); } @@ -460,19 +475,19 @@ ginHeapTupleFastCollect(Relation index, GinState *ginstate, } /* - * Creates tuple's array + * Build an index tuple for each key value, and add to array. In pending + * tuples we just stick the heap TID into t_tid. 
*/ for (i = 0; i < nentries; i++) { - collector->tuples[collector->ntuples + i] = - GinFormTuple(index, ginstate, attnum, entries[i], NULL, 0, true); - collector->tuples[collector->ntuples + i]->t_tid = *item; - collector->sumsize += IndexTupleSize(collector->tuples[collector->ntuples + i]); - } + IndexTuple itup; - collector->ntuples += nentries; - - return nentries; + itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], + NULL, 0, true); + itup->t_tid = *ht_ctid; + collector->tuples[collector->ntuples++] = itup; + collector->sumsize += IndexTupleSize(itup); + } } /* @@ -564,7 +579,7 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, MarkBufferDirty(buffers[i]); } - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; @@ -591,38 +606,55 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, return false; } -/* Add datum to DatumArray, resizing if needed */ +/* Initialize empty KeyArray */ static void -addDatum(DatumArray *datums, Datum datum) +initKeyArray(KeyArray *keys, int32 maxvalues) { - if (datums->nvalues >= datums->maxvalues) + keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues); + keys->categories = (GinNullCategory *) + palloc(sizeof(GinNullCategory) * maxvalues); + keys->nvalues = 0; + keys->maxvalues = maxvalues; +} + +/* Add datum to KeyArray, resizing if needed */ +static void +addDatum(KeyArray *keys, Datum datum, GinNullCategory category) +{ + if (keys->nvalues >= keys->maxvalues) { - datums->maxvalues *= 2; - datums->values = (Datum *) repalloc(datums->values, - sizeof(Datum) * datums->maxvalues); + keys->maxvalues *= 2; + keys->keys = (Datum *) + repalloc(keys->keys, sizeof(Datum) * keys->maxvalues); + keys->categories = (GinNullCategory *) + repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues); } - datums->values[datums->nvalues++] = datum; + keys->keys[keys->nvalues] = datum; + keys->categories[keys->nvalues] = category; + keys->nvalues++; } /* - * Go through all tuples >= startoff on page and collect values in memory + * Collect data from a pending-list page in preparation for insertion into + * the main index. + * + * Go through all tuples >= startoff on page and collect values in accum * - * Note that da is just workspace --- it does not carry any state across + * Note that ka is just workspace --- it does not carry any state across * calls. */ static void -processPendingPage(BuildAccumulator *accum, DatumArray *da, +processPendingPage(BuildAccumulator *accum, KeyArray *ka, Page page, OffsetNumber startoff) { ItemPointerData heapptr; OffsetNumber i, maxoff; - OffsetNumber attrnum, - curattnum; + OffsetNumber attrnum; - /* reset *da to empty */ - da->nvalues = 0; + /* reset *ka to empty */ + ka->nvalues = 0; maxoff = PageGetMaxOffsetNumber(page); Assert(maxoff >= FirstOffsetNumber); @@ -632,7 +664,11 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da, for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + OffsetNumber curattnum; + Datum curkey; + GinNullCategory curcategory; + /* Check for change of heap TID or attnum */ curattnum = gintuple_get_attrnum(accum->ginstate, itup); if (!ItemPointerIsValid(&heapptr)) @@ -644,18 +680,25 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da, curattnum == attrnum)) { /* - * We can insert several datums per call, but only for one heap - * tuple and one column. 
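processPendingPage batches consecutive index tuples that share a heap TID and attribute number, flushing the collected keys at each boundary. A standalone sketch of that group-and-flush shape over a stream sorted by group:

    #include <stdio.h>

    typedef struct Rec { int group; int value; } Rec;

    static void
    flush(int group, const int *vals, int n)
    {
        printf("group %d: %d values\n", group, n);
    }

    /* Walk records sorted by group, collect values per group, flush on
     * every boundary and once at the end; the capacity of 64 values per
     * group is a sketch assumption. */
    static void
    process(const Rec *recs, int nrecs)
    {
        int vals[64];
        int nvals = 0;
        int curgroup = 0;

        for (int i = 0; i < nrecs; i++)
        {
            if (nvals > 0 && recs[i].group != curgroup)
            {
                flush(curgroup, vals, nvals);
                nvals = 0;
            }
            curgroup = recs[i].group;
            vals[nvals++] = recs[i].value;
        }
        if (nvals > 0)
            flush(curgroup, vals, nvals);
    }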
+ * ginInsertBAEntries can insert several datums per call, but only + * for one heap tuple and one column. So call it at a boundary, + * and reset ka. */ - ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues); - da->nvalues = 0; + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); + ka->nvalues = 0; heapptr = itup->t_tid; attrnum = curattnum; } - addDatum(da, gin_index_getattr(accum->ginstate, itup)); + + /* Add key to KeyArray */ + curkey = gintuple_get_key(accum->ginstate, itup, &curcategory); + addDatum(ka, curkey, curcategory); } - ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues); + /* Dump out all remaining keys */ + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); } /* @@ -679,9 +722,10 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da, * If stats isn't null, we count deleted pending pages into the counts. */ void -ginInsertCleanup(Relation index, GinState *ginstate, +ginInsertCleanup(GinState *ginstate, bool vac_delay, IndexBulkDeleteResult *stats) { + Relation index = ginstate->index; Buffer metabuffer, buffer; Page metapage, @@ -690,7 +734,7 @@ ginInsertCleanup(Relation index, GinState *ginstate, MemoryContext opCtx, oldCtx; BuildAccumulator accum; - DatumArray datums; + KeyArray datums; BlockNumber blkno; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); @@ -726,10 +770,7 @@ ginInsertCleanup(Relation index, GinState *ginstate, oldCtx = MemoryContextSwitchTo(opCtx); - datums.maxvalues = 128; - datums.nvalues = 0; - datums.values = (Datum *) palloc(sizeof(Datum) * datums.maxvalues); - + initKeyArray(&datums, 128); ginInitBA(&accum); accum.ginstate = ginstate; @@ -748,7 +789,7 @@ ginInsertCleanup(Relation index, GinState *ginstate, } /* - * read page's datums into memory + * read page's datums into accum */ processPendingPage(&accum, &datums, page, FirstOffsetNumber); @@ -769,7 +810,8 @@ ginInsertCleanup(Relation index, GinState *ginstate, { ItemPointerData *list; uint32 nlist; - Datum entry; + Datum key; + GinNullCategory category; OffsetNumber maxoff, attnum; @@ -786,9 +828,12 @@ ginInsertCleanup(Relation index, GinState *ginstate, * significant amount of time - so, run it without locking pending * list. 
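ginInsertCleanup runs its per-batch work inside a private memory context (opCtx) so that everything can be released in one MemoryContextReset between pending-list pages. A toy arena sketch of that allocate-in-bulk, reset-in-bulk pattern:

    #include <stdlib.h>

    typedef struct Arena {
        char  *buf;                     /* backing storage */
        size_t size;
        size_t used;
    } Arena;

    static void *
    arena_alloc(Arena *a, size_t n)
    {
        size_t need = (n + 7) & ~(size_t) 7;    /* 8-byte alignment */
        void  *p;

        if (a->used + need > a->size)
            return NULL;                /* arena exhausted */
        p = a->buf + a->used;
        a->used += need;
        return p;
    }

    static void
    arena_reset(Arena *a)
    {
        a->used = 0;    /* every batch allocation vanishes at once */
    }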
*/ - while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) + ginBeginBAScan(&accum); + while ((list = ginGetBAEntry(&accum, + &attnum, &key, &category, &nlist)) != NULL) { - ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE); + ginEntryInsert(ginstate, attnum, key, category, + list, nlist, NULL); if (vac_delay) vacuum_delay_point(); } @@ -820,8 +865,11 @@ ginInsertCleanup(Relation index, GinState *ginstate, ginInitBA(&accum); processPendingPage(&accum, &datums, page, maxoff + 1); - while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) - ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE); + ginBeginBAScan(&accum); + while ((list = ginGetBAEntry(&accum, + &attnum, &key, &category, &nlist)) != NULL) + ginEntryInsert(ginstate, attnum, key, category, + list, nlist, NULL); } /* @@ -855,9 +903,8 @@ ginInsertCleanup(Relation index, GinState *ginstate, * release memory used so far and reinit state */ MemoryContextReset(opCtx); + initKeyArray(&datums, datums.maxvalues); ginInitBA(&accum); - datums.nvalues = 0; - datums.values = (Datum *) palloc(sizeof(Datum) * datums.maxvalues); } else { diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 705d167963..866785837f 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -4,17 +4,17 @@ * fetch tuples from a GIN scan. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.30 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gin/ginget.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "access/relscan.h" #include "catalog/index.h" #include "miscadmin.h" @@ -34,25 +34,58 @@ typedef struct pendingPosition /* - * Tries to refind previously taken ItemPointer on page. + * Convenience function for invoking a key's consistentFn */ static bool -findItemInPage(Page page, ItemPointer item, OffsetNumber *off) +callConsistentFn(GinState *ginstate, GinScanKey key) +{ + /* + * If we're dealing with a dummy EVERYTHING key, we don't want to call the + * consistentFn; just claim it matches. + */ + if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING) + { + key->recheckCurItem = false; + return true; + } + + /* + * Initialize recheckCurItem in case the consistentFn doesn't know it + * should set it. The safe assumption in that case is to force recheck. + */ + key->recheckCurItem = true; + + return DatumGetBool(FunctionCall8Coll(&ginstate->consistentFn[key->attnum - 1], + ginstate->supportCollation[key->attnum - 1], + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + UInt32GetDatum(key->nuserentries), + PointerGetDatum(key->extra_data), + PointerGetDatum(&key->recheckCurItem), + PointerGetDatum(key->queryValues), + PointerGetDatum(key->queryCategories))); +} + +/* + * Tries to refind previously taken ItemPointer on a posting page. 
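callConsistentFn above presets recheckCurItem = true before invoking the opclass consistentFn, so a function that never touches the flag safely forces a heap recheck. A sketch of that defensive-default convention with a hypothetical callback type:

    #include <stdbool.h>

    typedef bool (*consistent_cb)(const bool *entryRes, int nentries,
                                  bool *recheck);

    static bool
    call_consistent(consistent_cb fn, const bool *entryRes, int nentries,
                    bool *recheck)
    {
        *recheck = true;    /* safe default if fn never sets it */
        return fn(entryRes, nentries, recheck);
    }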
+ */ +static bool +findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; int res; if (GinPageGetOpaque(page)->flags & GIN_DELETED) - /* page was deleted by concurrent vacuum */ + /* page was deleted by concurrent vacuum */ return false; /* * scan page to find equal or first greater value */ - for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) { - res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); + res = ginCompareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); if (res <= 0) return true; @@ -80,7 +113,9 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) return false; /* no more pages */ LockBuffer(stack->buffer, GIN_UNLOCK); - stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); + stack->buffer = ReleaseAndReadBuffer(stack->buffer, + btree->index, + stack->blkno); LockBuffer(stack->buffer, GIN_SHARE); stack->off = FirstOffsetNumber; } @@ -89,71 +124,94 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) } /* - * Does fullscan of posting tree and saves ItemPointers - * in scanEntry->partialMatch TIDBitmap + * Scan all pages of a posting tree and save all its heap ItemPointers + * in scanEntry->matchBitmap */ static void -scanForItems(Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree) +scanPostingTree(Relation index, GinScanEntry scanEntry, + BlockNumber rootPostingTree) { GinPostingTreeScan *gdi; Buffer buffer; Page page; BlockNumber blkno; - gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); + /* Descend to the leftmost leaf page */ + gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE); - buffer = scanBeginPostingTree(gdi); + buffer = ginScanBeginPostingTree(gdi); IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ freeGinBtreeStack(gdi->stack); pfree(gdi); /* - * Goes through all leaves + * Loop iterates through all leaf pages of posting tree */ for (;;) { page = BufferGetPage(buffer); - if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber) + if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && + GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber) { - tbm_add_tuples(scanEntry->partialMatch, + tbm_add_tuples(scanEntry->matchBitmap, (ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff, false); scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff; } - blkno = GinPageGetOpaque(page)->rightlink; if (GinPageRightMost(page)) - { - UnlockReleaseBuffer(buffer); - return; /* no more pages */ - } + break; /* no more pages */ + blkno = GinPageGetOpaque(page)->rightlink; LockBuffer(buffer, GIN_UNLOCK); buffer = ReleaseAndReadBuffer(buffer, index, blkno); LockBuffer(buffer, GIN_SHARE); } + + UnlockReleaseBuffer(buffer); } /* - * Collects all ItemPointer into the TIDBitmap struct - * for entries partially matched to search entry. + * Collects TIDs into scanEntry->matchBitmap for all heap tuples that + * match the search entry. This supports three different match modes: * - * Returns true if done, false if it's needed to restart scan from scratch + * 1. Partial-match support: scan from current point until the + * comparePartialFn says we're done. + * 2. SEARCH_MODE_ALL: scan from current point (which should be first + * key for the current attnum) until we hit null items or end of attnum + * 3. 
SEARCH_MODE_EVERYTHING: scan from current point (which should be first + * key for the current attnum) until we hit end of attnum + * + * Returns true if done, false if it's necessary to restart scan from scratch */ static bool -computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry scanEntry) +collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, + GinScanEntry scanEntry) { - Page page; - IndexTuple itup; - Datum idatum; - int32 cmp; + OffsetNumber attnum; + Form_pg_attribute attr; + + /* Initialize empty bitmap result */ + scanEntry->matchBitmap = tbm_create(work_mem * 1024L); - scanEntry->partialMatch = tbm_create(work_mem * 1024L); + /* Null query cannot partial-match anything */ + if (scanEntry->isPartialMatch && + scanEntry->queryCategory != GIN_CAT_NORM_KEY) + return true; + + /* Locate tupdesc entry for key column (for attbyval/attlen data) */ + attnum = scanEntry->attnum; + attr = btree->ginstate->origTupdesc->attrs[attnum - 1]; for (;;) { + Page page; + IndexTuple itup; + Datum idatum; + GinNullCategory icategory; + /* * stack->off points to the interested entry, buffer is already locked */ @@ -166,56 +224,85 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry /* * If tuple stores another attribute then stop scan */ - if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum) + if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) return true; - idatum = gin_index_getattr(btree->ginstate, itup); + /* Safe to fetch attribute value */ + idatum = gintuple_get_key(btree->ginstate, itup, &icategory); - - /*---------- - * Check of partial match. - * case cmp == 0 => match - * case cmp > 0 => not match and finish scan - * case cmp < 0 => not match and continue scan - *---------- + /* + * Check for appropriate scan stop conditions */ - cmp = DatumGetInt32(FunctionCall4(&btree->ginstate->comparePartialFn[scanEntry->attnum - 1], - scanEntry->entry, - idatum, - UInt16GetDatum(scanEntry->strategy), + if (scanEntry->isPartialMatch) + { + int32 cmp; + + /* + * In partial match, stop scan at any null (including + * placeholders); partial matches never match nulls + */ + if (icategory != GIN_CAT_NORM_KEY) + return true; + + /*---------- + * Check of partial match. + * case cmp == 0 => match + * case cmp > 0 => not match and finish scan + * case cmp < 0 => not match and continue scan + *---------- + */ + cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1], + btree->ginstate->supportCollation[attnum - 1], + scanEntry->queryKey, + idatum, + UInt16GetDatum(scanEntry->strategy), PointerGetDatum(scanEntry->extra_data))); - if (cmp > 0) - return true; - else if (cmp < 0) + if (cmp > 0) + return true; + else if (cmp < 0) + { + stack->off++; + continue; + } + } + else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL) { - stack->off++; - continue; + /* + * In ALL mode, we are not interested in null items, so we can + * stop if we get to a null-item placeholder (which will be the + * last entry for a given attnum). We do want to include NULL_KEY + * and EMPTY_ITEM entries, though. + */ + if (icategory == GIN_CAT_NULL_ITEM) + return true; } + /* + * OK, we want to return the TIDs listed in this entry. 
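The comparePartialFn convention spelled out above (0 = match, less than 0 = keep scanning, greater than 0 = stop) can be illustrated with a prefix matcher; this is a hedged sketch over C strings, not the code of any actual opclass:

    #include <string.h>

    /* Three-way partial-match comparator: the entry tree is scanned in
     * ascending key order starting at the query value, so once a key
     * sorts past every possible prefix extension we can stop. */
    static int
    prefix_compare_partial(const char *query, const char *indexkey)
    {
        int cmp = strncmp(query, indexkey, strlen(query));

        if (cmp == 0)
            return 0;       /* indexkey starts with query: match */
        if (cmp < 0)
            return 1;       /* indexkey beyond all matches: stop */
        return -1;          /* sorts before query: continue */
    }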
+ */ if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); - Datum newDatum, - savedDatum = datumCopy( - idatum, - btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval, - btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attlen - ); /* * We should unlock current page (but not unpin) during tree scan * to prevent deadlock with vacuum processes. * - * We save current entry value (savedDatum) to be able to refind - * our tuple after re-locking + * We save current entry value (idatum) to be able to re-find our + * tuple after re-locking */ + if (icategory == GIN_CAT_NORM_KEY) + idatum = datumCopy(idatum, attr->attbyval, attr->attlen); + LockBuffer(stack->buffer, GIN_UNLOCK); - scanForItems(btree->index, scanEntry, rootPostingTree); + + /* Collect all the TIDs in this entry's posting tree */ + scanPostingTree(btree->index, scanEntry, rootPostingTree); /* * We lock again the entry page and while it was unlocked insert - * might occured, so we need to refind our position + * might have occurred, so we need to re-find our position. */ LockBuffer(stack->buffer, GIN_SHARE); page = BufferGetPage(stack->buffer); @@ -223,44 +310,49 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry { /* * Root page becomes non-leaf while we unlock it. We will - * start again, this situation doesn't cause often - root can - * became a non-leaf only one per life of index. + * start again, this situation doesn't occur often - root can + * become a non-leaf only once in the life of the index. */ - return false; } + /* Search forward to re-find idatum */ for (;;) { + Datum newDatum; + GinNullCategory newCategory; + if (moveRightIfItNeeded(btree, stack) == false) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ page = BufferGetPage(stack->buffer); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); - newDatum = gin_index_getattr(btree->ginstate, itup); - if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum) + if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ + newDatum = gintuple_get_key(btree->ginstate, itup, + &newCategory); - if (compareEntries(btree->ginstate, scanEntry->attnum, newDatum, savedDatum) == 0) - { - /* Found! */ - if (btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval == false) - pfree(DatumGetPointer(savedDatum)); - break; - } + if (ginCompareEntries(btree->ginstate, attnum, + newDatum, newCategory, + idatum, icategory) == 0) + break; /* Found! */ stack->off++; } + + if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval) + pfree(DatumGetPointer(idatum)); } else { - tbm_add_tuples(scanEntry->partialMatch, GinGetPosting(itup), GinGetNPosting(itup), false); + tbm_add_tuples(scanEntry->matchBitmap, + GinGetPosting(itup), GinGetNPosting(itup), false); scanEntry->predictNumberResult += GinGetNPosting(itup); } /* - * Ok, we save ItemPointers, go to the next entry + * Done with this entry, go to the next */ stack->off++; } @@ -272,72 +364,72 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry * Start* functions setup beginning state of searches: finds correct buffer and pins it. 
*/ static void -startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) +startScanEntry(GinState *ginstate, GinScanEntry entry) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; Page page; - bool needUnlock = TRUE; + bool needUnlock; +restartScanEntry: entry->buffer = InvalidBuffer; + ItemPointerSetMin(&entry->curItem); entry->offset = InvalidOffsetNumber; entry->list = NULL; entry->nlist = 0; - entry->partialMatch = NULL; - entry->partialMatchResult = NULL; + entry->matchBitmap = NULL; + entry->matchResult = NULL; entry->reduceResult = FALSE; entry->predictNumberResult = 0; - if (entry->master != NULL) - { - entry->isFinished = entry->master->isFinished; - return; - } - /* * we should find entry, and begin scan of posting tree or just store * posting list in memory */ - - prepareEntryScan(&btreeEntry, index, entry->attnum, entry->entry, ginstate); + ginPrepareEntryScan(&btreeEntry, entry->attnum, + entry->queryKey, entry->queryCategory, + ginstate); btreeEntry.searchMode = TRUE; stackEntry = ginFindLeafPage(&btreeEntry, NULL); page = BufferGetPage(stackEntry->buffer); + needUnlock = TRUE; entry->isFinished = TRUE; - if (entry->isPartialMatch) + if (entry->isPartialMatch || + entry->queryCategory == GIN_CAT_EMPTY_QUERY) { /* - * btreeEntry.findItem points to the first equal or greater value than - * needed. So we will scan further and collect all ItemPointers + * btreeEntry.findItem locates the first item >= given search key. + * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item + * because of the way the GIN_CAT_EMPTY_QUERY category code is + * assigned.) We scan forward from there and collect all TIDs needed + * for the entry type. */ btreeEntry.findItem(&btreeEntry, stackEntry); - if (computePartialMatchList(&btreeEntry, stackEntry, entry) == false) + if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false) { /* * GIN tree was seriously restructured, so we will cleanup all * found data and rescan. See comments near 'return false' in - * computePartialMatchList() + * collectMatchBitmap() */ - if (entry->partialMatch) + if (entry->matchBitmap) { - if (entry->partialMatchIterator) - tbm_end_iterate(entry->partialMatchIterator); - entry->partialMatchIterator = NULL; - tbm_free(entry->partialMatch); - entry->partialMatch = NULL; + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; + tbm_free(entry->matchBitmap); + entry->matchBitmap = NULL; } LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); - - startScanEntry(index, ginstate, entry); - return; + goto restartScanEntry; } - if (entry->partialMatch && !tbm_is_empty(entry->partialMatch)) + if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) { - entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch); + entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); entry->isFinished = FALSE; } } @@ -352,17 +444,17 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) Page page; /* - * We should unlock entry page before make deal with posting tree - * to prevent deadlocks with vacuum processes. Because entry is - * never deleted from page and posting tree is never reduced to - * the posting list, we can unlock page after getting BlockNumber - * of root of posting tree. + * We should unlock entry page before touching posting tree to + * prevent deadlocks with vacuum processes. 
Because entry is never + * deleted from page and posting tree is never reduced to the + * posting list, we can unlock page after getting BlockNumber of + * root of posting tree. */ LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = FALSE; - gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); + gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, TRUE); - entry->buffer = scanBeginPostingTree(gdi); + entry->buffer = ginScanBeginPostingTree(gdi); /* * We keep buffer pinned because we need to prevent deletion of @@ -402,19 +494,23 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) } static void -startScanKey(Relation index, GinState *ginstate, GinScanKey key) +startScanKey(GinState *ginstate, GinScanKey key) { - uint32 i; - - if (!key->firstCall) - return; + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; +} - for (i = 0; i < key->nentries; i++) - startScanEntry(index, ginstate, key->scanEntry + i); +static void +startScan(IndexScanDesc scan) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + GinState *ginstate = &so->ginstate; + uint32 i; - memset(key->entryRes, TRUE, sizeof(bool) * key->nentries); - key->isFinished = FALSE; - key->firstCall = FALSE; + for (i = 0; i < so->totalentries; i++) + startScanEntry(ginstate, so->entries[i]); if (GinFuzzySearchLimit > 0) { @@ -425,27 +521,20 @@ startScanKey(Relation index, GinState *ginstate, GinScanKey key) * minimal predictNumberResult. */ - for (i = 0; i < key->nentries; i++) - if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit) + for (i = 0; i < so->totalentries; i++) + if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit) return; - for (i = 0; i < key->nentries; i++) - if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit) + for (i = 0; i < so->totalentries; i++) + if (so->entries[i]->predictNumberResult > so->totalentries * GinFuzzySearchLimit) { - key->scanEntry[i].predictNumberResult /= key->nentries; - key->scanEntry[i].reduceResult = TRUE; + so->entries[i]->predictNumberResult /= so->totalentries; + so->entries[i]->reduceResult = TRUE; } } -} - -static void -startScan(IndexScanDesc scan) -{ - uint32 i; - GinScanOpaque so = (GinScanOpaque) scan->opaque; for (i = 0; i < so->nkeys; i++) - startScanKey(scan->indexRelation, &so->ginstate, so->keys + i); + startScanKey(ginstate, so->keys + i); } /* @@ -454,18 +543,16 @@ startScan(IndexScanDesc scan) * to prevent interference with vacuum */ static void -entryGetNextItem(Relation index, GinScanEntry entry) +entryGetNextItem(GinState *ginstate, GinScanEntry entry) { Page page; BlockNumber blkno; for (;;) { - entry->offset++; - - if (entry->offset <= entry->nlist) + if (entry->offset < entry->nlist) { - entry->curItem = entry->list[entry->offset - 1]; + entry->curItem = entry->list[entry->offset++]; return; } @@ -484,18 +571,21 @@ entryGetNextItem(Relation index, GinScanEntry entry) if (blkno == InvalidBlockNumber) { ReleaseBuffer(entry->buffer); - ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); + ItemPointerSetInvalid(&entry->curItem); entry->buffer = InvalidBuffer; entry->isFinished = TRUE; return; } - entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno); + entry->buffer = ReleaseAndReadBuffer(entry->buffer, + ginstate->index, + blkno); LockBuffer(entry->buffer, GIN_SHARE); page = BufferGetPage(entry->buffer); entry->offset = InvalidOffsetNumber; - 
if (!ItemPointerIsValid(&entry->curItem) || findItemInPage(page, &entry->curItem, &entry->offset)) + if (!ItemPointerIsValid(&entry->curItem) || + findItemInPostingPage(page, &entry->curItem, &entry->offset)) { /* * Found position equal to or greater than stored @@ -507,13 +597,13 @@ entryGetNextItem(Relation index, GinScanEntry entry) LockBuffer(entry->buffer, GIN_UNLOCK); if (!ItemPointerIsValid(&entry->curItem) || - compareItemPointers(&entry->curItem, entry->list + entry->offset - 1) == 0) + ginCompareItemPointers(&entry->curItem, + entry->list + entry->offset - 1) == 0) { /* * First pages are deleted or empty, or we found exact * position, so break inner loop and continue outer one. */ - break; } @@ -532,52 +622,57 @@ entryGetNextItem(Relation index, GinScanEntry entry) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* - * Sets entry->curItem to new found heap item pointer for one - * entry of one scan key + * Sets entry->curItem to next heap item pointer for one entry of one scan key, + * or sets entry->isFinished to TRUE if there are no more. + * + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * entry potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the + * current implementation this is guaranteed by the behavior of tidbitmaps. */ -static bool -entryGetItem(Relation index, GinScanEntry entry) +static void +entryGetItem(GinState *ginstate, GinScanEntry entry) { - if (entry->master) - { - entry->isFinished = entry->master->isFinished; - entry->curItem = entry->master->curItem; - } - else if (entry->partialMatch) + Assert(!entry->isFinished); + + if (entry->matchBitmap) { do { - if (entry->partialMatchResult == NULL || - entry->offset >= entry->partialMatchResult->ntuples) + if (entry->matchResult == NULL || + entry->offset >= entry->matchResult->ntuples) { - entry->partialMatchResult = tbm_iterate(entry->partialMatchIterator); + entry->matchResult = tbm_iterate(entry->matchIterator); - if (entry->partialMatchResult == NULL) + if (entry->matchResult == NULL) { - ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); - tbm_end_iterate(entry->partialMatchIterator); - entry->partialMatchIterator = NULL; + ItemPointerSetInvalid(&entry->curItem); + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; entry->isFinished = TRUE; break; } /* - * reset counter to the beginning of - * entry->partialMatchResult. Note: entry->offset is still - * greater than partialMatchResult->ntuples if - * partialMatchResult is lossy. So, on next call we will get - * next result from TIDBitmap. + * Reset counter to the beginning of entry->matchResult. Note: + * entry->offset is still greater than matchResult->ntuples if + * matchResult is lossy. So, on next call we will get next + * result from TIDBitmap. 
*/ entry->offset = 0; } - if (entry->partialMatchResult->ntuples < 0) + if (entry->matchResult->ntuples < 0) { /* * lossy result, so we need to check the whole page */ ItemPointerSetLossyPage(&entry->curItem, - entry->partialMatchResult->blockno); + entry->matchResult->blockno); /* * We might as well fall out of the loop; we could not @@ -588,8 +683,8 @@ entryGetItem(Relation index, GinScanEntry entry) } ItemPointerSet(&entry->curItem, - entry->partialMatchResult->blockno, - entry->partialMatchResult->offsets[entry->offset]); + entry->matchResult->blockno, + entry->matchResult->offsets[entry->offset]); entry->offset++; } while (entry->reduceResult == TRUE && dropItem(entry)); } @@ -600,7 +695,7 @@ entryGetItem(Relation index, GinScanEntry entry) entry->curItem = entry->list[entry->offset - 1]; else { - ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); + ItemPointerSetInvalid(&entry->curItem); entry->isFinished = TRUE; } } @@ -608,137 +703,350 @@ entryGetItem(Relation index, GinScanEntry entry) { do { - entryGetNextItem(index, entry); - } while (entry->isFinished == FALSE && entry->reduceResult == TRUE && dropItem(entry)); + entryGetNextItem(ginstate, entry); + } while (entry->isFinished == FALSE && + entry->reduceResult == TRUE && + dropItem(entry)); } - - return entry->isFinished; } /* - * Sets key->curItem to new found heap item pointer for one scan key - * Returns isFinished, ie TRUE means we did NOT get a new item pointer! - * Also, *keyrecheck is set true if recheck is needed for this scan key. - * Note: lossy page could be returned after items from the same page. + * Identify the "current" item among the input entry streams for this scan key, + * and test whether it passes the scan key qual condition. + * + * The current item is the smallest curItem among the inputs. key->curItem + * is set to that value. key->curItemMatches is set to indicate whether that + * TID passes the consistentFn test. If so, key->recheckCurItem is set true + * iff recheck is needed for this item pointer (including the case where the + * item pointer is a lossy page pointer). + * + * If all entry streams are exhausted, sets key->isFinished to TRUE. + * + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * key potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in scanGetItem.) */ -static bool -keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, - GinScanKey key, bool *keyrecheck) +static void +keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) { + ItemPointerData minItem; + ItemPointerData curPageLossy; uint32 i; + uint32 lossyEntry; + bool haveLossyEntry; GinScanEntry entry; bool res; MemoryContext oldCtx; - if (key->isFinished) - return TRUE; + Assert(!key->isFinished); - do + /* + * Find the minimum of the active entry curItems. + * + * Note: a lossy-page entry is encoded by an ItemPointer with max value for + * offset (0xffff), so that it will sort after any exact entries for the + * same page. So we'll prefer to return exact pointers not lossy + * pointers, which is good. 
+ */ + ItemPointerSetMax(&minItem); + + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished == FALSE && + ginCompareItemPointers(&entry->curItem, &minItem) < 0) + minItem = entry->curItem; + } + + if (ItemPointerIsMax(&minItem)) + { + /* all entries are finished */ + key->isFinished = TRUE; + return; + } + + /* + * We might have already tested this item; if so, no need to repeat work. + * (Note: the ">" case can happen, if minItem is exact but we previously + * had to set curItem to a lossy-page pointer.) + */ + if (ginCompareItemPointers(&key->curItem, &minItem) >= 0) + return; + + /* + * OK, advance key->curItem and perform consistentFn test. + */ + key->curItem = minItem; + + /* + * Lossy-page entries pose a problem, since we don't know the correct + * entryRes state to pass to the consistentFn, and we also don't know what + * its combining logic will be (could be AND, OR, or even NOT). If the + * logic is OR then the consistentFn might succeed for all items in the + * lossy page even when none of the other entries match. + * + * If we have a single lossy-page entry then we check to see if the + * consistentFn will succeed with only that entry TRUE. If so, we return + * a lossy-page pointer to indicate that the whole heap page must be + * checked. (On subsequent calls, we'll do nothing until minItem is past + * the page altogether, thus ensuring that we never return both regular + * and lossy pointers for the same page.) + * + * This idea could be generalized to more than one lossy-page entry, but + * ideally lossy-page entries should be infrequent so it would seldom be + * the case that we have more than one at once. So it doesn't seem worth + * the extra complexity to optimize that case. If we do find more than + * one, we just punt and return a lossy-page pointer always. + * + * Note that only lossy-page entries pointing to the current item's page + * should trigger this processing; we might have future lossy pages in the + * entry array, but they aren't relevant yet. + */ + ItemPointerSetLossyPage(&curPageLossy, + GinItemPointerGetBlockNumber(&key->curItem)); + + lossyEntry = 0; + haveLossyEntry = false; + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished == FALSE && + ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + { + if (haveLossyEntry) + { + /* Multiple lossy entries, punt */ + key->curItem = curPageLossy; + key->curItemMatches = true; + key->recheckCurItem = true; + return; + } + lossyEntry = i; + haveLossyEntry = true; + } + } + + /* prepare for calling consistentFn in temp context */ + oldCtx = MemoryContextSwitchTo(tempCtx); + + if (haveLossyEntry) + { + /* Single lossy-page entry, so see if whole page matches */ + memset(key->entryRes, FALSE, key->nentries); + key->entryRes[lossyEntry] = TRUE; + + if (callConsistentFn(ginstate, key)) + { + /* Yes, so clean up ... */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); + + /* and return lossy pointer for whole page */ + key->curItem = curPageLossy; + key->curItemMatches = true; + key->recheckCurItem = true; + return; + } + } + + /* + * At this point we know that we don't need to return a lossy whole-page + * pointer, but we might have matches for individual exact item pointers, + * possibly in combination with a lossy pointer. Our strategy if there's + * a lossy pointer is to try the consistentFn both ways and return a hit + * if it accepts either one (forcing the hit to be marked lossy so it will + * be rechecked). 
An exception is that we don't need to try it both ways + * if the lossy pointer is in a "hidden" entry, because the consistentFn's + * result can't depend on that. + * + * Prepare entryRes array to be passed to consistentFn. + */ + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished == FALSE && + ginCompareItemPointers(&entry->curItem, &key->curItem) == 0) + key->entryRes[i] = TRUE; + else + key->entryRes[i] = FALSE; + } + if (haveLossyEntry) + key->entryRes[lossyEntry] = TRUE; + + res = callConsistentFn(ginstate, key); + + if (!res && haveLossyEntry && lossyEntry < key->nuserentries) + { + /* try the other way for the lossy item */ + key->entryRes[lossyEntry] = FALSE; + + res = callConsistentFn(ginstate, key); + } + + key->curItemMatches = res; + /* If we matched a lossy entry, force recheckCurItem = true */ + if (haveLossyEntry) + key->recheckCurItem = true; + + /* clean up after consistentFn calls */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); +} + +/* + * Get next heap item pointer (after advancePast) from scan. + * Returns true if anything found. + * On success, *item and *recheck are set. + * + * Note: this is very nearly the same logic as in keyGetItem(), except + * that we know the keys are to be combined with AND logic, whereas in + * keyGetItem() the combination logic is known only to the consistentFn. + */ +static bool +scanGetItem(IndexScanDesc scan, ItemPointer advancePast, + ItemPointerData *item, bool *recheck) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + GinState *ginstate = &so->ginstate; + ItemPointerData myAdvancePast = *advancePast; + uint32 i; + bool allFinished; + bool match; + + for (;;) { /* - * move forward from previously value and set new curItem, which is - * minimal from entries->curItems. Lossy page is encoded by - * ItemPointer with max value for offset (0xffff), so if there is an - * non-lossy entries on lossy page they will returned too and after - * that the whole page. That's not a problem for resulting tidbitmap. + * Advance any entries that are <= myAdvancePast. In particular, + * since entry->curItem was initialized with ItemPointerSetMin, this + * ensures we fetch the first item for each entry on the first call. */ - ItemPointerSetMax(&key->curItem); - for (i = 0; i < key->nentries; i++) + allFinished = TRUE; + + for (i = 0; i < so->totalentries; i++) { - entry = key->scanEntry + i; + GinScanEntry entry = so->entries[i]; - if (key->entryRes[i]) - { - /* - * Move forward only entries which was the least on previous - * call, key->entryRes[i] points that current entry was a - * result of loop/call. - */ - if (entry->isFinished == FALSE && entryGetItem(index, entry) == FALSE) - { - if (compareItemPointers(&entry->curItem, &key->curItem) < 0) - key->curItem = entry->curItem; - } - else - key->entryRes[i] = FALSE; - } - else if (entry->isFinished == FALSE) - { - if (compareItemPointers(&entry->curItem, &key->curItem) < 0) - key->curItem = entry->curItem; - } + while (entry->isFinished == FALSE && + ginCompareItemPointers(&entry->curItem, + &myAdvancePast) <= 0) + entryGetItem(ginstate, entry); + + if (entry->isFinished == FALSE) + allFinished = FALSE; } - if (ItemPointerIsMax(&key->curItem)) + if (allFinished) { - /* all entries are finished */ - key->isFinished = TRUE; - return TRUE; + /* all entries exhausted, so we're done */ + return false; } /* - * Now key->curItem contains closest ItemPointer to previous result. 
- * - * if key->nentries == 1 then the consistentFn should always succeed, - * but we must call it anyway to find out the recheck status. + * Perform the consistentFn test for each scan key. If any key + * reports isFinished, meaning its subset of the entries is exhausted, + * we can stop. Otherwise, set *item to the minimum of the key + * curItems. */ + ItemPointerSetMax(item); + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + keyGetItem(&so->ginstate, so->tempCtx, key); + + if (key->isFinished) + return false; /* finished one of keys */ + + if (ginCompareItemPointers(&key->curItem, item) < 0) + *item = key->curItem; + } + + Assert(!ItemPointerIsMax(item)); /*---------- - * entryRes array is used for: - * - as an argument for consistentFn - * - entry->curItem with corresponding key->entryRes[i] == false are - * greater than key->curItem, so next loop/call they should be - * renewed by entryGetItem(). So, we need to set up an array before - * checking of lossy page. + * Now *item contains first ItemPointer after previous result. + * + * The item is a valid hit only if all the keys succeeded for either + * that exact TID, or a lossy reference to the same page. + * + * This logic works only if a keyGetItem stream can never contain both + * exact and lossy pointers for the same page. Else we could have a + * case like + * + * stream 1 stream 2 + * ... ... + * 42/6 42/7 + * 50/1 42/0xffff + * ... ... + * + * We would conclude that 42/6 is not a match and advance stream 1, + * thus never detecting the match to the lossy pointer in stream 2. + * (keyGetItem has a similar problem versus entryGetItem.) *---------- */ - for (i = 0; i < key->nentries; i++) + match = true; + for (i = 0; i < so->nkeys; i++) { - entry = key->scanEntry + i; + GinScanKey key = so->keys + i; - if (entry->isFinished == FALSE && - compareItemPointers(&entry->curItem, &key->curItem) == 0) - key->entryRes[i] = TRUE; - else - key->entryRes[i] = FALSE; + if (key->curItemMatches) + { + if (ginCompareItemPointers(item, &key->curItem) == 0) + continue; + if (ItemPointerIsLossyPage(&key->curItem) && + GinItemPointerGetBlockNumber(&key->curItem) == + GinItemPointerGetBlockNumber(item)) + continue; + } + match = false; + break; } - /* - * Initialize *keyrecheck in case the consistentFn doesn't know it - * should set it. The safe assumption in that case is to force - * recheck. - */ - *keyrecheck = true; + if (match) + break; /* - * If one of the entry's scans returns lossy result, return it without - * further checking - we can't call consistentFn for lack of data. + * No hit. Update myAdvancePast to this TID, so that on the next pass + * we'll move to the next possible entry. */ - if (ItemPointerIsLossyPage(&key->curItem)) - return FALSE; - - oldCtx = MemoryContextSwitchTo(tempCtx); - res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(keyrecheck))); - MemoryContextSwitchTo(oldCtx); - MemoryContextReset(tempCtx); - } while (!res); + myAdvancePast = *item; + } - return FALSE; + /* + * We must return recheck = true if any of the keys are marked recheck. 
+ */ + *recheck = false; + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (key->recheckCurItem) + { + *recheck = true; + break; + } + } + + return TRUE; } /* + * Functions for scanning the pending list + */ + + +/* * Get ItemPointer of next heap row to be checked from pending list. - * Returns false if there are no more. On pages with several rows + * Returns false if there are no more. On pages with several heap rows * it returns each row separately, on page with part of heap row returns - * per page data. pos->firstOffset and pos->lastOffset points - * fraction of tuples for current heap row. + * per page data. pos->firstOffset and pos->lastOffset are set to identify + * the range of pending-list tuples belonging to this heap row. * * The pendingBuffer is presumed pinned and share-locked on entry, and is * pinned and share-locked on success exit. On failure exit it's released. @@ -810,10 +1118,9 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) /* * Now pos->firstOffset points to the first tuple of current heap - * row, pos->lastOffset points to the first tuple of second heap - * row (or to the end of page) + * row, pos->lastOffset points to the first tuple of next heap row + * (or to the end of page) */ - break; } } @@ -822,35 +1129,47 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) } /* - * Scan page from current tuple (off) up till the first of: + * Scan pending-list page from current tuple (off) up till the first of: * - match is found (then returns true) * - no later match is possible * - tuple's attribute number is not equal to entry's attrnum * - reach end of page + * + * datum[]/category[]/datumExtracted[] arrays are used to cache the results + * of gintuple_get_key() on the current page. */ static bool matchPartialInPendingList(GinState *ginstate, Page page, OffsetNumber off, OffsetNumber maxoff, - Datum value, OffsetNumber attrnum, - Datum *datum, bool *datumExtracted, - StrategyNumber strategy, - Pointer extra_data) + GinScanEntry entry, + Datum *datum, GinNullCategory *category, + bool *datumExtracted) { IndexTuple itup; int32 cmp; + /* Partial match to a null is not possible */ + if (entry->queryCategory != GIN_CAT_NORM_KEY) + return false; + while (off < maxoff) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); - if (attrnum != gintuple_get_attrnum(ginstate, itup)) + + if (gintuple_get_attrnum(ginstate, itup) != entry->attnum) return false; if (datumExtracted[off - 1] == false) { - datum[off - 1] = gin_index_getattr(ginstate, itup); + datum[off - 1] = gintuple_get_key(ginstate, itup, + &category[off - 1]); datumExtracted[off - 1] = true; } + /* Once we hit nulls, no further match is possible */ + if (category[off - 1] != GIN_CAT_NORM_KEY) + return false; + /*---------- * Check partial match. 
* case cmp == 0 => match @@ -858,11 +1177,12 @@ matchPartialInPendingList(GinState *ginstate, Page page, * case cmp < 0 => not match and continue scan *---------- */ - cmp = DatumGetInt32(FunctionCall4(&ginstate->comparePartialFn[attrnum - 1], - value, - datum[off - 1], - UInt16GetDatum(strategy), - PointerGetDatum(extra_data))); + cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1], + ginstate->supportCollation[entry->attnum - 1], + entry->queryKey, + datum[off - 1], + UInt16GetDatum(entry->strategy), + PointerGetDatum(entry->extra_data))); if (cmp == 0) return true; else if (cmp > 0) @@ -874,27 +1194,20 @@ matchPartialInPendingList(GinState *ginstate, Page page, return false; } -static bool -hasAllMatchingKeys(GinScanOpaque so, pendingPosition *pos) -{ - int i; - - for (i = 0; i < so->nkeys; i++) - if (pos->hasMatchKey[i] == false) - return false; - - return true; -} - /* - * Sets entryRes array for each key by looking at - * every entry per indexed value (heap's row) in pending list. - * returns true if at least one of datum was matched by key's entry + * Set up the entryRes array for each key by looking at + * every entry for current heap row in pending list. + * + * Returns true if each scan key has at least one entryRes match. + * This corresponds to the situations where the normal index search will + * try to apply the key's consistentFn. (A tuple not meeting that requirement + * cannot be returned by the normal search since no entry stream will + * source its TID.) * * The pendingBuffer is presumed pinned and share-locked on entry. */ static bool -collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) +collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) { GinScanOpaque so = (GinScanOpaque) scan->opaque; OffsetNumber attrnum; @@ -904,7 +1217,7 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) j; /* - * Resets entryRes + * Reset all entryRes and hasMatchKey flags */ for (i = 0; i < so->nkeys; i++) { @@ -914,13 +1227,19 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) } memset(pos->hasMatchKey, FALSE, so->nkeys); + /* + * Outer loop iterates over multiple pending-list pages when a single heap + * row has entries spanning those pages. + */ for (;;) { Datum datum[BLCKSZ / sizeof(IndexTupleData)]; + GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)]; Assert(pos->lastOffset > pos->firstOffset); - memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset)); + memset(datumExtracted + pos->firstOffset - 1, 0, + sizeof(bool) * (pos->lastOffset - pos->firstOffset)); page = BufferGetPage(pos->pendingBuffer); @@ -930,128 +1249,174 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) for (j = 0; j < key->nentries; j++) { + GinScanEntry entry = key->scanEntry[j]; OffsetNumber StopLow = pos->firstOffset, StopHigh = pos->lastOffset, StopMiddle; - GinScanEntry entry = key->scanEntry + j; - /* already true - do not extra work */ + /* If already matched on earlier page, do no extra work */ if (key->entryRes[j]) continue; /* - * Interested tuples are from pos->firstOffset to + * Interesting tuples are from pos->firstOffset to * pos->lastOffset and they are ordered by (attnum, Datum) as - * it's done in entry tree So we could use binary search to - * prevent linear scanning + * it's done in entry tree. So we can use binary search to + * avoid linear scanning. 
*/ while (StopLow < StopHigh) { + int res; + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); + attrnum = gintuple_get_attrnum(&so->ginstate, itup); if (key->attnum < attrnum) + { StopHigh = StopMiddle; - else if (key->attnum > attrnum) + continue; + } + if (key->attnum > attrnum) + { StopLow = StopMiddle + 1; - else + continue; + } + + if (datumExtracted[StopMiddle - 1] == false) { - int res; + datum[StopMiddle - 1] = + gintuple_get_key(&so->ginstate, itup, + &category[StopMiddle - 1]); + datumExtracted[StopMiddle - 1] = true; + } - if (datumExtracted[StopMiddle - 1] == false) + if (entry->queryCategory == GIN_CAT_EMPTY_QUERY) + { + /* special behavior depending on searchMode */ + if (entry->searchMode == GIN_SEARCH_MODE_ALL) { - datum[StopMiddle - 1] = gin_index_getattr(&so->ginstate, itup); - datumExtracted[StopMiddle - 1] = true; + /* match anything except NULL_ITEM */ + if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM) + res = -1; + else + res = 0; } - res = compareEntries(&so->ginstate, - entry->attnum, - entry->entry, - datum[StopMiddle - 1]); - - if (res == 0) + else { - /* - * The exact match causes, so we just scan from - * current position to find a partial match. See - * comment above about tuple's ordering. - */ - if (entry->isPartialMatch) - key->entryRes[j] = - matchPartialInPendingList(&so->ginstate, - page, StopMiddle, - pos->lastOffset, - entry->entry, - entry->attnum, - datum, - datumExtracted, - entry->strategy, - entry->extra_data); - else - key->entryRes[j] = true; - break; + /* match everything */ + res = 0; } - else if (res < 0) - StopHigh = StopMiddle; + } + else + { + res = ginCompareEntries(&so->ginstate, + entry->attnum, + entry->queryKey, + entry->queryCategory, + datum[StopMiddle - 1], + category[StopMiddle - 1]); + } + + if (res == 0) + { + /* + * Found exact match (there can be only one, except in + * EMPTY_QUERY mode). + * + * If doing partial match, scan forward from here to + * end of page to check for matches. + * + * See comment above about tuple's ordering. + */ + if (entry->isPartialMatch) + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, + StopMiddle, + pos->lastOffset, + entry, + datum, + category, + datumExtracted); else - StopLow = StopMiddle + 1; + key->entryRes[j] = true; + + /* done with binary search */ + break; } + else if (res < 0) + StopHigh = StopMiddle; + else + StopLow = StopMiddle + 1; } if (StopLow >= StopHigh && entry->isPartialMatch) { /* - * The exact match wasn't found, so we need to start scan - * from first tuple greater then current entry See comment - * above about tuple's ordering. + * No exact match on this page. If doing partial match, + * scan from the first tuple greater than target value to + * end of page. Note that since we don't remember whether + * the comparePartialFn told us to stop early on a + * previous page, we will uselessly apply comparePartialFn + * to the first tuple on each subsequent page. 
*/ key->entryRes[j] = matchPartialInPendingList(&so->ginstate, - page, StopHigh, + page, + StopHigh, pos->lastOffset, - entry->entry, - entry->attnum, + entry, datum, - datumExtracted, - entry->strategy, - entry->extra_data); + category, + datumExtracted); } pos->hasMatchKey[i] |= key->entryRes[j]; } } + /* Advance firstOffset over the scanned tuples */ pos->firstOffset = pos->lastOffset; if (GinPageHasFullRow(page)) { /* - * We scan all values from one tuple, go to next one + * We have examined all pending entries for the current heap row. + * Break out of loop over pages. */ - - return hasAllMatchingKeys(so, pos); + break; } else { - ItemPointerData item = pos->item; - /* - * need to get next portion of tuples of row containing on several - * pages + * Advance to next page of pending entries for the current heap + * row. Complain if there isn't one. */ + ItemPointerData item = pos->item; - if (scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item)) - elog(ERROR, "Could not process tuple"); /* XXX should not be - * here ! */ + if (scanGetCandidate(scan, pos) == false || + !ItemPointerEquals(&pos->item, &item)) + elog(ERROR, "could not find additional pending pages for same heap tuple"); } } - return hasAllMatchingKeys(so, pos); + /* + * Now return "true" if all scan keys have at least one matching datum + */ + for (i = 0; i < so->nkeys; i++) + { + if (pos->hasMatchKey[i] == false) + return false; + } + + return true; } /* - * Collect all matched rows from pending list in bitmap + * Collect all matched rows from pending list into bitmap */ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) @@ -1059,7 +1424,6 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) GinScanOpaque so = (GinScanOpaque) scan->opaque; MemoryContext oldCtx; bool recheck, - keyrecheck, match; int i; pendingPosition pos; @@ -1095,16 +1459,18 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) while (scanGetCandidate(scan, &pos)) { /* - * Check entries in tuple and setup entryRes array If tuples of heap's - * row are placed on several pages collectDatumForItem will read all - * of that pages. + * Check entries in tuple and set up entryRes array. + * + * If pending tuples belonging to the current heap row are spread + * across several pages, collectMatchesForHeapRow will read all of + * those pages. */ - if (!collectDatumForItem(scan, &pos)) + if (!collectMatchesForHeapRow(scan, &pos)) continue; /* - * Matching of entries of one row is finished, so check row by - * consistent function. + * Matching of entries of one row is finished, so check row using + * consistent functions. 
*/ oldCtx = MemoryContextSwitchTo(so->tempCtx); recheck = false; @@ -1114,21 +1480,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { GinScanKey key = so->keys + i; - keyrecheck = true; - - if (!DatumGetBool(FunctionCall6(&so->ginstate.consistentFn[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(&keyrecheck)))) + if (!callConsistentFn(&so->ginstate, key)) { match = false; break; } - - recheck |= keyrecheck; + recheck |= key->recheckCurItem; } MemoryContextSwitchTo(oldCtx); @@ -1144,84 +1501,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) pfree(pos.hasMatchKey); } -/* - * Get heap item pointer from scan - * returns true if found - */ -static bool -scanGetItem(IndexScanDesc scan, ItemPointerData *item, bool *recheck) -{ - GinScanOpaque so = (GinScanOpaque) scan->opaque; - uint32 i; - bool keyrecheck; - - /* - * We return recheck = true if any of the keyGetItem calls return - * keyrecheck = true. Note that because the second loop might advance - * some keys, this could theoretically be too conservative. In practice - * though, we expect that a consistentFn's recheck result will depend only - * on the operator and the query, so for any one key it should stay the - * same regardless of advancing to new items. So it's not worth working - * harder. - */ - *recheck = false; - - ItemPointerSetMin(item); - for (i = 0; i < so->nkeys; i++) - { - GinScanKey key = so->keys + i; - - if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, - key, &keyrecheck)) - return FALSE; /* finished one of keys */ - if (compareItemPointers(item, &key->curItem) < 0) - *item = key->curItem; - *recheck |= keyrecheck; - } - - for (i = 1; i <= so->nkeys; i++) - { - GinScanKey key = so->keys + i - 1; - - for (;;) - { - int cmp = compareItemPointers(item, &key->curItem); - - if (cmp != 0 && (ItemPointerIsLossyPage(item) || ItemPointerIsLossyPage(&key->curItem))) - { - /* - * if one of ItemPointers points to the whole page then - * compare only page's number - */ - if (ItemPointerGetBlockNumber(item) == ItemPointerGetBlockNumber(&key->curItem)) - cmp = 0; - else - cmp = (ItemPointerGetBlockNumber(item) > ItemPointerGetBlockNumber(&key->curItem)) ? 1 : -1; - } - - if (cmp == 0) - break; - else if (cmp > 0) - { - if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, - key, &keyrecheck)) - return FALSE; /* finished one of keys */ - *recheck |= keyrecheck; - } - else - { /* returns to begin */ - *item = key->curItem; - i = 0; - break; - } - } - } - - return TRUE; -} #define GinIsNewKey(s) ( ((GinScanOpaque) scan->opaque)->keys == NULL ) -#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes == true ) +#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes ) Datum gingetbitmap(PG_FUNCTION_ARGS) @@ -1229,9 +1511,14 @@ gingetbitmap(PG_FUNCTION_ARGS) IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); int64 ntids; + ItemPointerData iptr; + bool recheck; + /* + * Set up the scan keys, and check for unsatisfiable query. 
+ */ if (GinIsNewKey(scan)) - newScanKey(scan); + ginNewScanKey(scan); if (GinIsVoidRes(scan)) PG_RETURN_INT64(0); @@ -1255,14 +1542,13 @@ gingetbitmap(PG_FUNCTION_ARGS) */ startScan(scan); + ItemPointerSetMin(&iptr); + for (;;) { - ItemPointerData iptr; - bool recheck; - CHECK_FOR_INTERRUPTS(); - if (!scanGetItem(scan, &iptr, &recheck)) + if (!scanGetItem(scan, &iptr, &iptr, &recheck)) break; if (ItemPointerIsLossyPage(&iptr)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e2a5e8b013..3e32af94a9 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -4,21 +4,21 @@ * insert routines for the postgres inverted index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.26 2010/02/11 14:29:50 teodor Exp $ + * src/backend/access/gin/gininsert.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/genam.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/smgr.h" #include "storage/indexfsm.h" #include "utils/memutils.h" @@ -27,14 +27,17 @@ typedef struct { GinState ginstate; double indtuples; + GinStatsData buildStats; MemoryContext tmpCtx; MemoryContext funcCtx; BuildAccumulator accum; } GinBuildState; /* - * Creates posting tree with one page. Function - * suppose that items[] fits to page + * Creates new posting tree with one page, containing the given TIDs. + * Returns the page number (which will be the root of this posting tree). + * + * items[] must be in sorted order with no duplicates. */ static BlockNumber createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) @@ -43,6 +46,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) Buffer buffer = GinNewBuffer(index); Page page; + /* Assert that the items[] array will fit on one page */ + Assert(nitems <= GinMaxLeafDataItems); + START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); @@ -54,7 +60,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) MarkBufferDirty(buffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata[2]; @@ -74,12 +80,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) rdata[1].len = sizeof(ItemPointerData) * nitems; rdata[1].next = NULL; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); - } UnlockReleaseBuffer(buffer); @@ -91,143 +94,245 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) /* - * Adds array of item pointers to tuple's posting list or - * creates posting tree and tuple pointed to tree in a case + * Adds array of item pointers to tuple's posting list, or + * creates posting tree and tuple pointing to tree in case * of not enough space. Max size of tuple is defined in - * GinFormTuple(). + * GinFormTuple(). Returns a new, modified index tuple. + * items[] must be in sorted order with no duplicates. 
*/ static IndexTuple -addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, - IndexTuple old, ItemPointerData *items, uint32 nitem, bool isBuild) +addItemPointersToLeafTuple(GinState *ginstate, + IndexTuple old, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) { - Datum key = gin_index_getattr(ginstate, old); - OffsetNumber attnum = gintuple_get_attrnum(ginstate, old); - IndexTuple res = GinFormTuple(index, ginstate, attnum, key, - NULL, nitem + GinGetNPosting(old), - false); + OffsetNumber attnum; + Datum key; + GinNullCategory category; + IndexTuple res; + + Assert(!GinIsPostingTree(old)); + + attnum = gintuple_get_attrnum(ginstate, old); + key = gintuple_get_key(ginstate, old, &category); + + /* try to build tuple with room for all the items */ + res = GinFormTuple(ginstate, attnum, key, category, + NULL, nitem + GinGetNPosting(old), + false); if (res) { /* good, small enough */ uint32 newnitem; - newnitem = MergeItemPointers(GinGetPosting(res), - GinGetPosting(old), GinGetNPosting(old), - items, nitem); + /* fill in the posting list with union of old and new TIDs */ + newnitem = ginMergeItemPointers(GinGetPosting(res), + GinGetPosting(old), + GinGetNPosting(old), + items, nitem); /* merge might have eliminated some duplicate items */ GinShortenTuple(res, newnitem); } else { + /* posting list would be too big, convert to posting tree */ BlockNumber postingRoot; GinPostingTreeScan *gdi; - /* posting list becomes big, so we need to make posting's tree */ - res = GinFormTuple(index, ginstate, attnum, key, NULL, 0, true); - postingRoot = createPostingTree(index, GinGetPosting(old), GinGetNPosting(old)); - GinSetPostingTree(res, postingRoot); + /* + * Initialize posting tree with the old tuple's posting list. It's + * surely small enough to fit on one posting-tree page, and should + * already be in order with no duplicates. + */ + postingRoot = createPostingTree(ginstate->index, + GinGetPosting(old), + GinGetNPosting(old)); - gdi = prepareScanPostingTree(index, postingRoot, FALSE); - gdi->btree.isBuild = isBuild; + /* During index build, count the newly-added data page */ + if (buildStats) + buildStats->nDataPages++; - insertItemPointer(gdi, items, nitem); + /* Now insert the TIDs-to-be-added into the posting tree */ + gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE); + gdi->btree.isBuild = (buildStats != NULL); + + ginInsertItemPointers(gdi, items, nitem, buildStats); pfree(gdi); + + /* And build a new posting-tree-only result tuple */ + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true); + GinSetPostingTree(res, postingRoot); + } + + return res; +} + +/* + * Build a fresh leaf tuple, either posting-list or posting-tree format + * depending on whether the given items list will fit. + * items[] must be in sorted order with no duplicates. + * + * This is basically the same logic as in addItemPointersToLeafTuple, + * but working from slightly different input. + */ +static IndexTuple +buildFreshLeafTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) +{ + IndexTuple res; + + /* try to build tuple with room for all the items */ + res = GinFormTuple(ginstate, attnum, key, category, + items, nitem, false); + + if (!res) + { + /* posting list would be too big, build posting tree */ + BlockNumber postingRoot; + + /* + * Build posting-tree-only result tuple. 
We do this first so as to + * fail quickly if the key is too big. + */ + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true); + + /* + * Initialize posting tree with as many TIDs as will fit on the first + * page. + */ + postingRoot = createPostingTree(ginstate->index, + items, + Min(nitem, GinMaxLeafDataItems)); + + /* During index build, count the newly-added data page */ + if (buildStats) + buildStats->nDataPages++; + + /* Add any remaining TIDs to the posting tree */ + if (nitem > GinMaxLeafDataItems) + { + GinPostingTreeScan *gdi; + + gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE); + gdi->btree.isBuild = (buildStats != NULL); + + ginInsertItemPointers(gdi, + items + GinMaxLeafDataItems, + nitem - GinMaxLeafDataItems, + buildStats); + + pfree(gdi); + } + + /* And save the root link in the result tuple */ + GinSetPostingTree(res, postingRoot); } return res; } /* - * Inserts only one entry to the index, but it can add more than 1 ItemPointer. + * Insert one or more heap TIDs associated with the given key value. + * This will either add a single key entry, or enlarge a pre-existing entry. + * + * During an index build, buildStats is non-null and the counters + * it contains should be incremented as needed. */ void -ginEntryInsert(Relation index, GinState *ginstate, - OffsetNumber attnum, Datum value, +ginEntryInsert(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, ItemPointerData *items, uint32 nitem, - bool isBuild) + GinStatsData *buildStats) { GinBtreeData btree; GinBtreeStack *stack; IndexTuple itup; Page page; - prepareEntryScan(&btree, index, attnum, value, ginstate); + /* During index build, count the to-be-inserted entry */ + if (buildStats) + buildStats->nEntries++; + + ginPrepareEntryScan(&btree, attnum, key, category, ginstate); stack = ginFindLeafPage(&btree, NULL); page = BufferGetPage(stack->buffer); if (btree.findItem(&btree, stack)) { - /* found entry */ + /* found pre-existing entry */ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); if (GinIsPostingTree(itup)) { - /* lock root of posting tree */ - GinPostingTreeScan *gdi; + /* add entries to existing posting tree */ BlockNumber rootPostingTree = GinGetPostingTree(itup); + GinPostingTreeScan *gdi; /* release all stack */ LockBuffer(stack->buffer, GIN_UNLOCK); freeGinBtreeStack(stack); /* insert into posting tree */ - gdi = prepareScanPostingTree(index, rootPostingTree, FALSE); - gdi->btree.isBuild = isBuild; - insertItemPointer(gdi, items, nitem); + gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, FALSE); + gdi->btree.isBuild = (buildStats != NULL); + ginInsertItemPointers(gdi, items, nitem, buildStats); + pfree(gdi); return; } - itup = addItemPointersToTuple(index, ginstate, stack, itup, items, nitem, isBuild); + /* modify an existing leaf entry */ + itup = addItemPointersToLeafTuple(ginstate, itup, + items, nitem, buildStats); btree.isDelete = TRUE; } else { - /* We suppose that tuple can store at least one itempointer */ - itup = GinFormTuple(index, ginstate, attnum, value, items, 1, true); - - if (nitem > 1) - { - /* Add the rest, making a posting tree if necessary */ - IndexTuple previtup = itup; - - itup = addItemPointersToTuple(index, ginstate, stack, previtup, items + 1, nitem - 1, isBuild); - pfree(previtup); - } + /* no match, so construct a new leaf entry */ + itup = buildFreshLeafTuple(ginstate, attnum, key, category, + items, nitem, buildStats); } + /* Insert the new or modified leaf tuple */ 
btree.entry = itup; - ginInsertValue(&btree, stack); + ginInsertValue(&btree, stack, buildStats); pfree(itup); } /* - * Saves indexed value in memory accumulator during index creation - * Function isn't used during normal insert + * Extract index entries for a single indexable item, and add them to the + * BuildAccumulator's state. + * + * This function is used only during initial index creation. */ -static uint32 -ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, Datum value, ItemPointer heapptr) +static void +ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer heapptr) { Datum *entries; + GinNullCategory *categories; int32 nentries; MemoryContext oldCtx; oldCtx = MemoryContextSwitchTo(buildstate->funcCtx); - entries = extractEntriesSU(buildstate->accum.ginstate, attnum, value, &nentries); + entries = ginExtractEntries(buildstate->accum.ginstate, attnum, + value, isNull, + &nentries, &categories); MemoryContextSwitchTo(oldCtx); - if (nentries == 0) - /* nothing to insert */ - return 0; + ginInsertBAEntries(&buildstate->accum, heapptr, attnum, + entries, categories, nentries); - ginInsertRecordBA(&buildstate->accum, heapptr, attnum, entries, nentries); + buildstate->indtuples += nentries; MemoryContextReset(buildstate->funcCtx); - - return nentries; } static void @@ -241,24 +346,27 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values, oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++) - if (!isnull[i]) - buildstate->indtuples += ginHeapTupleBulkInsert(buildstate, - (OffsetNumber) (i + 1), values[i], - &htup->t_self); + ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), + values[i], isnull[i], + &htup->t_self); /* If we've maxed out our available memory, dump everything to the index */ if (buildstate->accum.allocatedMemory >= maintenance_work_mem * 1024L) { ItemPointerData *list; - Datum entry; + Datum key; + GinNullCategory category; uint32 nlist; OffsetNumber attnum; - while ((list = ginGetEntry(&buildstate->accum, &attnum, &entry, &nlist)) != NULL) + ginBeginBAScan(&buildstate->accum); + while ((list = ginGetBAEntry(&buildstate->accum, + &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); - ginEntryInsert(index, &buildstate->ginstate, attnum, entry, list, nlist, TRUE); + ginEntryInsert(&buildstate->ginstate, attnum, key, category, + list, nlist, &buildstate->buildStats); } MemoryContextReset(buildstate->tmpCtx); @@ -280,7 +388,8 @@ ginbuild(PG_FUNCTION_ARGS) Buffer RootBuffer, MetaBuffer; ItemPointerData *list; - Datum entry; + Datum key; + GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; @@ -290,6 +399,8 @@ ginbuild(PG_FUNCTION_ARGS) RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); + buildstate.indtuples = 0; + memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); @@ -303,7 +414,7 @@ ginbuild(PG_FUNCTION_ARGS) GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; @@ -329,8 +440,8 @@ ginbuild(PG_FUNCTION_ARGS) UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); - /* build the index */ - buildstate.indtuples = 0; + /* count the root as first entry page */ + buildstate.buildStats.nEntryPages++; /* * create a 
temporary memory context that is reset once for each tuple @@ -360,17 +471,26 @@ ginbuild(PG_FUNCTION_ARGS) /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); - while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL) + ginBeginBAScan(&buildstate.accum); + while ((list = ginGetBAEntry(&buildstate.accum, + &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); - ginEntryInsert(index, &buildstate.ginstate, attnum, entry, list, nlist, TRUE); + ginEntryInsert(&buildstate.ginstate, attnum, key, category, + list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.tmpCtx); /* + * Update metapage stats + */ + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + ginUpdateStats(index, &buildstate.buildStats); + + /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); @@ -382,25 +502,66 @@ ginbuild(PG_FUNCTION_ARGS) } /* - * Inserts value during normal insertion + * ginbuildempty() -- build an empty gin index in the initialization fork */ -static uint32 -ginHeapTupleInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datum value, ItemPointer item) +Datum +ginbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Buffer RootBuffer, + MetaBuffer; + + /* An empty GIN index has two pages. */ + MetaBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE); + RootBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize both pages and mark them dirty. */ + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + + /* XLOG the new pages */ + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BufferGetBlockNumber(MetaBuffer), + BufferGetPage(MetaBuffer)); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BufferGetBlockNumber(RootBuffer), + BufferGetPage(RootBuffer)); + END_CRIT_SECTION(); + + /* Unlock and release the buffers. 
*/ + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); + + PG_RETURN_VOID(); +} + +/* + * Insert index entries for a single indexable item during "normal" + * (non-fast-update) insertion + */ +static void +ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer item) { Datum *entries; + GinNullCategory *categories; int32 i, nentries; - entries = extractEntriesSU(ginstate, attnum, value, &nentries); - - if (nentries == 0) - /* nothing to insert */ - return 0; + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); for (i = 0; i < nentries; i++) - ginEntryInsert(index, ginstate, attnum, entries[i], item, 1, FALSE); - - return nentries; + ginEntryInsert(ginstate, attnum, entries[i], categories[i], + item, 1, NULL); } Datum @@ -435,20 +596,21 @@ gininsert(PG_FUNCTION_ARGS) GinTupleCollector collector; memset(&collector, 0, sizeof(GinTupleCollector)); + for (i = 0; i < ginstate.origTupdesc->natts; i++) - if (!isnull[i]) - ginHeapTupleFastCollect(index, &ginstate, &collector, - (OffsetNumber) (i + 1), values[i], ht_ctid); + ginHeapTupleFastCollect(&ginstate, &collector, + (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); - ginHeapTupleFastInsert(index, &ginstate, &collector); + ginHeapTupleFastInsert(&ginstate, &collector); } else { for (i = 0; i < ginstate.origTupdesc->natts; i++) - if (!isnull[i]) - ginHeapTupleInsert(index, &ginstate, - (OffsetNumber) (i + 1), values[i], ht_ctid); - + ginHeapTupleInsert(&ginstate, (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); } MemoryContextSwitchTo(oldCtx); diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index 1aa4ea9082..f8d54b1b46 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -1,20 +1,20 @@ /*------------------------------------------------------------------------- * * ginscan.c - * routines to manage scans inverted index relations + * routines to manage scans of inverted index relations * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.26 2010/01/18 11:50:43 teodor Exp $ + * src/backend/access/gin/ginscan.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "access/relscan.h" #include "pgstat.h" #include "storage/bufmgr.h" @@ -26,160 +26,276 @@ Datum ginbeginscan(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - int keysz = PG_GETARG_INT32(1); - ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(1); + int norderbys = PG_GETARG_INT32(2); IndexScanDesc scan; + GinScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + scan = RelationGetIndexScan(rel, nkeys, norderbys); - scan = RelationGetIndexScan(rel, keysz, scankey); + /* allocate private workspace */ + so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); + so->keys = NULL; + so->nkeys = 0; + so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin scan temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + initGinState(&so->ginstate, scan->indexRelation); + + scan->opaque = so; PG_RETURN_POINTER(scan); } 
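An aside on the build-side hunks above: ginBuildCallback batches (key, heap TID) pairs in memory and dumps the whole accumulator into the index each time accum.allocatedMemory crosses the maintenance_work_mem budget; ginbuild then performs one final dump for whatever remains. Below is a minimal self-contained sketch of that batching pattern, with toy types and a hypothetical flush_accum() standing in for the ginGetBAEntry/ginEntryInsert loop; it illustrates the pattern only and is not the server code:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-ins for GIN's BuildAccumulator; not PostgreSQL code. */
    typedef struct
    {
        int     key;
        long    tid;
    } Pair;

    typedef struct
    {
        Pair   *pairs;
        size_t  n;
        size_t  cap;
        size_t  bytes;          /* plays the role of accum.allocatedMemory */
    } Accum;

    /* Hypothetical stand-in for the ginGetBAEntry/ginEntryInsert dump loop */
    static void
    flush_accum(Accum *a)
    {
        printf("dumping %zu pairs (%zu bytes) to the index\n", a->n, a->bytes);
        a->n = 0;
        a->bytes = 0;
    }

    static void
    add_pair(Accum *a, int key, long tid, size_t budget)
    {
        if (a->n == a->cap)
        {
            a->cap = a->cap ? a->cap * 2 : 64;
            a->pairs = realloc(a->pairs, a->cap * sizeof(Pair));
            if (a->pairs == NULL)
                exit(1);
        }
        a->pairs[a->n].key = key;
        a->pairs[a->n].tid = tid;
        a->n++;
        a->bytes += sizeof(Pair);

        /* cf. "allocatedMemory >= maintenance_work_mem * 1024L" above */
        if (a->bytes >= budget)
            flush_accum(a);
    }

    int
    main(void)
    {
        Accum   a = {NULL, 0, 0, 0};
        long    tid;

        for (tid = 0; tid < 1000; tid++)
            add_pair(&a, (int) (tid % 7), tid, 4096);
        if (a.n > 0)
            flush_accum(&a);    /* cf. ginbuild's "dump remaining entries" loop */
        free(a.pairs);
        return 0;
    }

The point of checking the budget on every insertion is that the flush cost is amortized over many heap tuples while peak memory stays bounded by the budget.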
-static void -fillScanKey(GinState *ginstate, GinScanKey key, OffsetNumber attnum, Datum query, - Datum *entryValues, bool *partial_matches, uint32 nEntryValues, - StrategyNumber strategy, Pointer *extra_data) +/* + * Create a new GinScanEntry, unless an equivalent one already exists, + * in which case just return it + */ +static GinScanEntry +ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum queryKey, GinNullCategory queryCategory, + bool isPartialMatch, Pointer extra_data) { - uint32 i, - j; - - key->nentries = nEntryValues; - key->entryRes = (bool *) palloc0(sizeof(bool) * nEntryValues); - key->scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData) * nEntryValues); - key->strategy = strategy; - key->attnum = attnum; - key->extra_data = extra_data; - key->query = query; - key->firstCall = TRUE; - ItemPointerSet(&(key->curItem), InvalidBlockNumber, InvalidOffsetNumber); - - for (i = 0; i < nEntryValues; i++) + GinState *ginstate = &so->ginstate; + GinScanEntry scanEntry; + uint32 i; + + /* + * Look for an existing equivalent entry. + * + * Entries with non-null extra_data are never considered identical, since + * we can't know exactly what the opclass might be doing with that. + */ + if (extra_data == NULL) { - key->scanEntry[i].pval = key->entryRes + i; - key->scanEntry[i].entry = entryValues[i]; - key->scanEntry[i].attnum = attnum; - key->scanEntry[i].extra_data = (extra_data) ? extra_data[i] : NULL; - ItemPointerSet(&(key->scanEntry[i].curItem), InvalidBlockNumber, InvalidOffsetNumber); - key->scanEntry[i].offset = InvalidOffsetNumber; - key->scanEntry[i].buffer = InvalidBuffer; - key->scanEntry[i].partialMatch = NULL; - key->scanEntry[i].partialMatchIterator = NULL; - key->scanEntry[i].partialMatchResult = NULL; - key->scanEntry[i].strategy = strategy; - key->scanEntry[i].list = NULL; - key->scanEntry[i].nlist = 0; - key->scanEntry[i].isPartialMatch = (ginstate->canPartialMatch[attnum - 1] && partial_matches) - ? 
partial_matches[i] : false; - - /* link to the equals entry in current scan key */ - key->scanEntry[i].master = NULL; - for (j = 0; j < i; j++) - if (compareEntries(ginstate, attnum, entryValues[i], entryValues[j]) == 0 && - key->scanEntry[i].isPartialMatch == key->scanEntry[j].isPartialMatch && - key->scanEntry[i].strategy == key->scanEntry[j].strategy) + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry prevEntry = so->entries[i]; + + if (prevEntry->extra_data == NULL && + prevEntry->isPartialMatch == isPartialMatch && + prevEntry->strategy == strategy && + prevEntry->searchMode == searchMode && + prevEntry->attnum == attnum && + ginCompareEntries(ginstate, attnum, + prevEntry->queryKey, + prevEntry->queryCategory, + queryKey, + queryCategory) == 0) { - key->scanEntry[i].master = key->scanEntry + j; - break; + /* Successful match */ + return prevEntry; } + } } -} -#ifdef NOT_USED + /* Nope, create a new entry */ + scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData)); + scanEntry->queryKey = queryKey; + scanEntry->queryCategory = queryCategory; + scanEntry->isPartialMatch = isPartialMatch; + scanEntry->extra_data = extra_data; + scanEntry->strategy = strategy; + scanEntry->searchMode = searchMode; + scanEntry->attnum = attnum; + + scanEntry->buffer = InvalidBuffer; + ItemPointerSetMin(&scanEntry->curItem); + scanEntry->matchBitmap = NULL; + scanEntry->matchIterator = NULL; + scanEntry->matchResult = NULL; + scanEntry->list = NULL; + scanEntry->nlist = 0; + scanEntry->offset = InvalidOffsetNumber; + scanEntry->isFinished = false; + scanEntry->reduceResult = false; + + /* Add it to so's array */ + if (so->totalentries >= so->allocentries) + { + so->allocentries *= 2; + so->entries = (GinScanEntry *) + repalloc(so->entries, so->allocentries * sizeof(GinScanEntry)); + } + so->entries[so->totalentries++] = scanEntry; + + return scanEntry; +} +/* + * Initialize the next GinScanKey using the output from the extractQueryFn + */ static void -resetScanKeys(GinScanKey keys, uint32 nkeys) +ginFillScanKey(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum query, uint32 nQueryValues, + Datum *queryValues, GinNullCategory *queryCategories, + bool *partial_matches, Pointer *extra_data) { - uint32 i, - j; + GinScanKey key = &(so->keys[so->nkeys++]); + GinState *ginstate = &so->ginstate; + uint32 nUserQueryValues = nQueryValues; + uint32 i; - if (keys == NULL) - return; + /* Non-default search modes add one "hidden" entry to each key */ + if (searchMode != GIN_SEARCH_MODE_DEFAULT) + nQueryValues++; + key->nentries = nQueryValues; + key->nuserentries = nUserQueryValues; - for (i = 0; i < nkeys; i++) - { - GinScanKey key = keys + i; + key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * nQueryValues); + key->entryRes = (bool *) palloc0(sizeof(bool) * nQueryValues); - key->firstCall = TRUE; - ItemPointerSet(&(key->curItem), InvalidBlockNumber, InvalidOffsetNumber); + key->query = query; + key->queryValues = queryValues; + key->queryCategories = queryCategories; + key->extra_data = extra_data; + key->strategy = strategy; + key->searchMode = searchMode; + key->attnum = attnum; + + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; - for (j = 0; j < key->nentries; j++) + for (i = 0; i < nQueryValues; i++) + { + Datum queryKey; + GinNullCategory queryCategory; + bool isPartialMatch; + Pointer this_extra; + + if (i < nUserQueryValues) { - if (key->scanEntry[j].buffer != 
InvalidBuffer) - ReleaseBuffer(key->scanEntry[i].buffer); - - ItemPointerSet(&(key->scanEntry[j].curItem), InvalidBlockNumber, InvalidOffsetNumber); - key->scanEntry[j].offset = InvalidOffsetNumber; - key->scanEntry[j].buffer = InvalidBuffer; - key->scanEntry[j].list = NULL; - key->scanEntry[j].nlist = 0; - key->scanEntry[j].partialMatch = NULL; - key->scanEntry[j].partialMatchIterator = NULL; - key->scanEntry[j].partialMatchResult = NULL; + /* set up normal entry using extractQueryFn's outputs */ + queryKey = queryValues[i]; + queryCategory = queryCategories[i]; + isPartialMatch = + (ginstate->canPartialMatch[attnum - 1] && partial_matches) + ? partial_matches[i] : false; + this_extra = (extra_data) ? extra_data[i] : NULL; } + else + { + /* set up hidden entry */ + queryKey = (Datum) 0; + switch (searchMode) + { + case GIN_SEARCH_MODE_INCLUDE_EMPTY: + queryCategory = GIN_CAT_EMPTY_ITEM; + break; + case GIN_SEARCH_MODE_ALL: + queryCategory = GIN_CAT_EMPTY_QUERY; + break; + case GIN_SEARCH_MODE_EVERYTHING: + queryCategory = GIN_CAT_EMPTY_QUERY; + break; + default: + elog(ERROR, "unexpected searchMode: %d", searchMode); + queryCategory = 0; /* keep compiler quiet */ + break; + } + isPartialMatch = false; + this_extra = NULL; + + /* + * We set the strategy to a fixed value so that ginFillScanEntry + * can combine these entries for different scan keys. This is + * safe because the strategy value in the entry struct is only + * used for partial-match cases. It's OK to overwrite our local + * variable here because this is the last loop iteration. + */ + strategy = InvalidStrategy; + } + + key->scanEntry[i] = ginFillScanEntry(so, attnum, + strategy, searchMode, + queryKey, queryCategory, + isPartialMatch, this_extra); } } -#endif static void -freeScanKeys(GinScanKey keys, uint32 nkeys) +freeScanKeys(GinScanOpaque so) { - uint32 i, - j; + uint32 i; - if (keys == NULL) + if (so->keys == NULL) return; - for (i = 0; i < nkeys; i++) + for (i = 0; i < so->nkeys; i++) { - GinScanKey key = keys + i; - - for (j = 0; j < key->nentries; j++) - { - if (key->scanEntry[j].buffer != InvalidBuffer) - ReleaseBuffer(key->scanEntry[j].buffer); - if (key->scanEntry[j].list) - pfree(key->scanEntry[j].list); - if (key->scanEntry[j].partialMatchIterator) - tbm_end_iterate(key->scanEntry[j].partialMatchIterator); - if (key->scanEntry[j].partialMatch) - tbm_free(key->scanEntry[j].partialMatch); - } + GinScanKey key = so->keys + i; - pfree(key->entryRes); pfree(key->scanEntry); + pfree(key->entryRes); } - pfree(keys); + pfree(so->keys); + so->keys = NULL; + so->nkeys = 0; + + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry entry = so->entries[i]; + + if (entry->buffer != InvalidBuffer) + ReleaseBuffer(entry->buffer); + if (entry->list) + pfree(entry->list); + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + if (entry->matchBitmap) + tbm_free(entry->matchBitmap); + pfree(entry); + } + + pfree(so->entries); + so->entries = NULL; + so->totalentries = 0; } void -newScanKey(IndexScanDesc scan) +ginNewScanKey(IndexScanDesc scan) { ScanKey scankey = scan->keyData; GinScanOpaque so = (GinScanOpaque) scan->opaque; int i; - uint32 nkeys = 0; + bool hasNullQuery = false; - if (scan->numberOfKeys < 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("GIN indexes do not support whole-index scans"))); + /* if no scan keys provided, allocate extra EVERYTHING GinScanKey */ + so->keys = (GinScanKey) + palloc(Max(scan->numberOfKeys, 1) * sizeof(GinScanKeyData)); + so->nkeys = 0; - so->keys 
= (GinScanKey) palloc(scan->numberOfKeys * sizeof(GinScanKeyData)); + /* initialize expansible array of GinScanEntry pointers */ + so->totalentries = 0; + so->allocentries = 32; + so->entries = (GinScanEntry *) + palloc0(so->allocentries * sizeof(GinScanEntry)); so->isVoidRes = false; for (i = 0; i < scan->numberOfKeys; i++) { ScanKey skey = &scankey[i]; - Datum *entryValues; - int32 nEntryValues = 0; + Datum *queryValues; + int32 nQueryValues = 0; bool *partial_matches = NULL; Pointer *extra_data = NULL; + bool *nullFlags = NULL; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; /* - * Assume, that GIN-indexable operators are strict, so nothing could - * be found + * We assume that GIN-indexable operators are strict, so a null query + * argument means an unsatisfiable query. */ if (skey->sk_flags & SK_ISNULL) { @@ -187,48 +303,105 @@ newScanKey(IndexScanDesc scan) break; } - entryValues = (Datum *) - DatumGetPointer(FunctionCall5(&so->ginstate.extractQueryFn[skey->sk_attno - 1], - skey->sk_argument, - PointerGetDatum(&nEntryValues), - UInt16GetDatum(skey->sk_strategy), - PointerGetDatum(&partial_matches), - PointerGetDatum(&extra_data))); + /* OK to call the extractQueryFn */ + queryValues = (Datum *) + DatumGetPointer(FunctionCall7Coll(&so->ginstate.extractQueryFn[skey->sk_attno - 1], + so->ginstate.supportCollation[skey->sk_attno - 1], + skey->sk_argument, + PointerGetDatum(&nQueryValues), + UInt16GetDatum(skey->sk_strategy), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode))); + + /* + * If bogus searchMode is returned, treat as GIN_SEARCH_MODE_ALL; note + * in particular we don't allow extractQueryFn to select + * GIN_SEARCH_MODE_EVERYTHING. + */ + if (searchMode < GIN_SEARCH_MODE_DEFAULT || + searchMode > GIN_SEARCH_MODE_ALL) + searchMode = GIN_SEARCH_MODE_ALL; + + /* Non-default modes require the index to have placeholders */ + if (searchMode != GIN_SEARCH_MODE_DEFAULT) + hasNullQuery = true; - if (nEntryValues < 0) + /* + * In default mode, no keys means an unsatisfiable query. + */ + if (queryValues == NULL || nQueryValues <= 0) { - /* - * extractQueryFn signals that nothing can match, so we can just - * set isVoidRes flag. No need to examine any more keys. - */ - so->isVoidRes = true; - break; + if (searchMode == GIN_SEARCH_MODE_DEFAULT) + { + so->isVoidRes = true; + break; + } + nQueryValues = 0; /* ensure sane value */ } - if (entryValues == NULL || nEntryValues == 0) + /* + * If the extractQueryFn didn't create a nullFlags array, create one, + * assuming that everything's non-null. Otherwise, run through the + * array and make sure each value is exactly 0 or 1; this ensures + * binary compatibility with the GinNullCategory representation. While + * at it, detect whether any null keys are present. + */ + if (nullFlags == NULL) + nullFlags = (bool *) palloc0(nQueryValues * sizeof(bool)); + else { - /* - * extractQueryFn signals that everything matches. This would - * require a full scan, which we can't do, but perhaps there is - * another scankey that provides a restriction to use. So we keep - * going and check only at the end. 
- */ - continue; + int32 j; + + for (j = 0; j < nQueryValues; j++) + { + if (nullFlags[j]) + { + nullFlags[j] = true; /* not any other nonzero value */ + hasNullQuery = true; + } + } } + /* now we can use the nullFlags as category codes */ - fillScanKey(&so->ginstate, &(so->keys[nkeys]), - skey->sk_attno, skey->sk_argument, - entryValues, partial_matches, nEntryValues, - skey->sk_strategy, extra_data); - nkeys++; + ginFillScanKey(so, skey->sk_attno, + skey->sk_strategy, searchMode, + skey->sk_argument, nQueryValues, + queryValues, (GinNullCategory *) nullFlags, + partial_matches, extra_data); } - if (nkeys == 0 && !so->isVoidRes) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("GIN indexes do not support whole-index scans"))); + /* + * If there are no regular scan keys, generate an EVERYTHING scankey to + * drive a full-index scan. + */ + if (so->nkeys == 0 && !so->isVoidRes) + { + hasNullQuery = true; + ginFillScanKey(so, FirstOffsetNumber, + InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING, + (Datum) 0, 0, + NULL, NULL, NULL, NULL); + } - so->nkeys = nkeys; + /* + * If the index is version 0, it may be missing null and placeholder + * entries, which would render searches for nulls and full-index scans + * unreliable. Throw an error if so. + */ + if (hasNullQuery && !so->isVoidRes) + { + GinStatsData ginStats; + + ginGetStats(scan->indexRelation, &ginStats); + if (ginStats.ginVersion < 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("old GIN indexes do not support whole-index scans nor searches for nulls"), + errhint("To fix this, do REINDEX INDEX \"%s\".", + RelationGetRelationName(scan->indexRelation)))); + } pgstat_count_index_scan(scan->indexRelation); } @@ -238,28 +411,11 @@ ginrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); - GinScanOpaque so; - - so = (GinScanOpaque) scan->opaque; - if (so == NULL) - { - /* if called from ginbeginscan */ - so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); - so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, - "Gin scan temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - initGinState(&so->ginstate, scan->indexRelation); - scan->opaque = so; - } - else - { - freeScanKeys(so->keys, so->nkeys); - } + /* remaining arguments are ignored */ + GinScanOpaque so = (GinScanOpaque) scan->opaque; - so->keys = NULL; + freeScanKeys(so); if (scankey && scan->numberOfKeys > 0) { @@ -277,14 +433,11 @@ ginendscan(PG_FUNCTION_ARGS) IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); GinScanOpaque so = (GinScanOpaque) scan->opaque; - if (so != NULL) - { - freeScanKeys(so->keys, so->nkeys); + freeScanKeys(so); - MemoryContextDelete(so->tempCtx); + MemoryContextDelete(so->tempCtx); - pfree(so); - } + pfree(so); PG_RETURN_VOID(); } diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 2a75820e22..ba142bc874 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -4,44 +4,61 @@ * utilities routines for the postgres inverted index access method. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.23 2010/01/02 16:57:33 momjian Exp $ + * src/backend/access/gin/ginutil.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/genam.h" -#include "access/gin.h" + +#include "access/gin_private.h" #include "access/reloptions.h" +#include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" + +/* + * initGinState: fill in an empty GinState struct to describe the index + * + * Note: assorted subsidiary data is allocated in the CurrentMemoryContext. + */ void initGinState(GinState *state, Relation index) { + TupleDesc origTupdesc = RelationGetDescr(index); int i; - state->origTupdesc = index->rd_att; + MemSet(state, 0, sizeof(GinState)); - state->oneCol = (index->rd_att->natts == 1) ? true : false; + state->index = index; + state->oneCol = (origTupdesc->natts == 1) ? true : false; + state->origTupdesc = origTupdesc; - for (i = 0; i < index->rd_att->natts; i++) + for (i = 0; i < origTupdesc->natts; i++) { - state->tupdesc[i] = CreateTemplateTupleDesc(2, false); - - TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, - INT2OID, -1, 0); - TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, - index->rd_att->attrs[i]->atttypid, - index->rd_att->attrs[i]->atttypmod, - index->rd_att->attrs[i]->attndims - ); + if (state->oneCol) + state->tupdesc[i] = state->origTupdesc; + else + { + state->tupdesc[i] = CreateTemplateTupleDesc(2, false); + + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, + INT2OID, -1, 0); + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, + origTupdesc->attrs[i]->atttypid, + origTupdesc->attrs[i]->atttypmod, + origTupdesc->attrs[i]->attndims); + TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2, + origTupdesc->attrs[i]->attcollation); + } fmgr_info_copy(&(state->compareFn[i]), index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), @@ -64,13 +81,29 @@ initGinState(GinState *state, Relation index) fmgr_info_copy(&(state->comparePartialFn[i]), index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC), CurrentMemoryContext); - state->canPartialMatch[i] = true; } else { state->canPartialMatch[i] = false; } + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type (for instance, hstore + * uses text index entries). If there's no index collation then + * specify default collation in case the support functions need + * collation. This is harmless if the support functions don't care + * about collation, so we just do it unconditionally. (We could + * alternatively call get_typcollation, but that seems like expensive + * overkill --- there aren't going to be any cases where a GIN storage + * type has a nondefault collation.) 
+ */ + if (OidIsValid(index->rd_indcollation[i])) + state->supportCollation[i] = index->rd_indcollation[i]; + else + state->supportCollation[i] = DEFAULT_COLLATION_OID; } } @@ -80,9 +113,14 @@ initGinState(GinState *state, Relation index) OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) { - OffsetNumber colN = FirstOffsetNumber; + OffsetNumber colN; - if (!ginstate->oneCol) + if (ginstate->oneCol) + { + /* column number is not stored explicitly */ + colN = FirstOffsetNumber; + } + else { Datum res; bool isnull; @@ -103,13 +141,14 @@ gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) } /* - * Extract stored datum from GIN tuple + * Extract stored datum (and possible null category) from GIN tuple */ Datum -gin_index_getattr(GinState *ginstate, IndexTuple tuple) +gintuple_get_key(GinState *ginstate, IndexTuple tuple, + GinNullCategory *category) { - bool isnull; Datum res; + bool isnull; if (ginstate->oneCol) { @@ -132,7 +171,10 @@ gin_index_getattr(GinState *ginstate, IndexTuple tuple) &isnull); } - Assert(!isnull); + if (isnull) + *category = GinGetNullCategory(tuple, ginstate); + else + *category = GIN_CAT_NORM_KEY; return res; } @@ -142,7 +184,6 @@ gin_index_getattr(GinState *ginstate, IndexTuple tuple) * The returned buffer is already pinned and exclusive-locked * Caller is responsible for initializing the page by calling GinInitBuffer */ - Buffer GinNewBuffer(Relation index) { @@ -227,100 +268,227 @@ GinInitMetabuffer(Buffer b) metadata->tailFreeSize = 0; metadata->nPendingPages = 0; metadata->nPendingHeapTuples = 0; + metadata->nTotalPages = 0; + metadata->nEntryPages = 0; + metadata->nDataPages = 0; + metadata->nEntries = 0; + metadata->ginVersion = GIN_CURRENT_VERSION; } +/* + * Compare two keys of the same index column + */ int -compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b) +ginCompareEntries(GinState *ginstate, OffsetNumber attnum, + Datum a, GinNullCategory categorya, + Datum b, GinNullCategory categoryb) { - return DatumGetInt32( - FunctionCall2( - &ginstate->compareFn[attnum - 1], - a, b - ) - ); + /* if not of same null category, sort by that first */ + if (categorya != categoryb) + return (categorya < categoryb) ? -1 : 1; + + /* all null items in same category are equal */ + if (categorya != GIN_CAT_NORM_KEY) + return 0; + + /* both not null, so safe to call the compareFn */ + return DatumGetInt32(FunctionCall2Coll(&ginstate->compareFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + a, b)); } +/* + * Compare two keys of possibly different index columns + */ int -compareAttEntries(GinState *ginstate, OffsetNumber attnum_a, Datum a, - OffsetNumber attnum_b, Datum b) +ginCompareAttEntries(GinState *ginstate, + OffsetNumber attnuma, Datum a, GinNullCategory categorya, + OffsetNumber attnumb, Datum b, GinNullCategory categoryb) { - if (attnum_a == attnum_b) - return compareEntries(ginstate, attnum_a, a, b); + /* attribute number is the first sort key */ + if (attnuma != attnumb) + return (attnuma < attnumb) ? -1 : 1; - return (attnum_a < attnum_b) ? -1 : 1; + return ginCompareEntries(ginstate, attnuma, a, categorya, b, categoryb); } + +/* + * Support for sorting key datums in ginExtractEntries + * + * Note: we only have to worry about null and not-null keys here; + * ginExtractEntries never generates more than one placeholder null, + * so it doesn't have to sort those. 
+ */ +typedef struct +{ + Datum datum; + bool isnull; +} keyEntryData; + typedef struct { FmgrInfo *cmpDatumFunc; - bool *needUnique; -} cmpEntriesData; + Oid collation; + bool haveDups; +} cmpEntriesArg; static int -cmpEntries(const Datum *a, const Datum *b, cmpEntriesData *arg) +cmpEntries(const void *a, const void *b, void *arg) { - int res = DatumGetInt32(FunctionCall2(arg->cmpDatumFunc, - *a, *b)); + const keyEntryData *aa = (const keyEntryData *) a; + const keyEntryData *bb = (const keyEntryData *) b; + cmpEntriesArg *data = (cmpEntriesArg *) arg; + int res; + if (aa->isnull) + { + if (bb->isnull) + res = 0; /* NULL "=" NULL */ + else + res = 1; /* NULL ">" not-NULL */ + } + else if (bb->isnull) + res = -1; /* not-NULL "<" NULL */ + else + res = DatumGetInt32(FunctionCall2Coll(data->cmpDatumFunc, + data->collation, + aa->datum, bb->datum)); + + /* + * Detect if we have any duplicates. If there are equal keys, qsort must + * compare them at some point, else it wouldn't know whether one should go + * before or after the other. + */ if (res == 0) - *(arg->needUnique) = TRUE; + data->haveDups = true; return res; } + +/* + * Extract the index key values from an indexable item + * + * The resulting key values are sorted, and any duplicates are removed. + * This avoids generating redundant index entries. + */ Datum * -extractEntriesS(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *nentries, - bool *needUnique) +ginExtractEntries(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + int32 *nentries, GinNullCategory **categories) { Datum *entries; - - entries = (Datum *) DatumGetPointer(FunctionCall2( - &ginstate->extractValueFn[attnum - 1], - value, - PointerGetDatum(nentries) - )); - - if (entries == NULL) - *nentries = 0; - - *needUnique = FALSE; - if (*nentries > 1) + bool *nullFlags; + int32 i; + + /* + * We don't call the extractValueFn on a null item. Instead generate a + * placeholder. + */ + if (isNull) { - cmpEntriesData arg; - - arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1]; - arg.needUnique = needUnique; - qsort_arg(entries, *nentries, sizeof(Datum), - (qsort_arg_comparator) cmpEntries, (void *) &arg); + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_NULL_ITEM; + return entries; } - return entries; -} - - -Datum * -extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *nentries) -{ - bool needUnique; - Datum *entries = extractEntriesS(ginstate, attnum, value, nentries, - &needUnique); + /* OK, call the opclass's extractValueFn */ + nullFlags = NULL; /* in case extractValue doesn't set it */ + entries = (Datum *) + DatumGetPointer(FunctionCall3Coll(&ginstate->extractValueFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + value, + PointerGetDatum(nentries), + PointerGetDatum(&nullFlags))); + + /* + * Generate a placeholder if the item contained no keys. + */ + if (entries == NULL || *nentries <= 0) + { + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_EMPTY_ITEM; + return entries; + } - if (needUnique) + /* + * If the extractValueFn didn't create a nullFlags array, create one, + * assuming that everything's non-null. 
Otherwise, run through the array + * and make sure each value is exactly 0 or 1; this ensures binary + * compatibility with the GinNullCategory representation. + */ + if (nullFlags == NULL) + nullFlags = (bool *) palloc0(*nentries * sizeof(bool)); + else + { + for (i = 0; i < *nentries; i++) + nullFlags[i] = (nullFlags[i] ? true : false); + } + /* now we can use the nullFlags as category codes */ + *categories = (GinNullCategory *) nullFlags; + + /* + * If there's more than one key, sort and unique-ify. + * + * XXX Using qsort here is notationally painful, and the overhead is + * pretty bad too. For small numbers of keys it'd likely be better to use + * a simple insertion sort. + */ + if (*nentries > 1) { - Datum *ptr, - *res; + keyEntryData *keydata; + cmpEntriesArg arg; - ptr = res = entries; + keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData)); + for (i = 0; i < *nentries; i++) + { + keydata[i].datum = entries[i]; + keydata[i].isnull = nullFlags[i]; + } + + arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1]; + arg.collation = ginstate->supportCollation[attnum - 1]; + arg.haveDups = false; + qsort_arg(keydata, *nentries, sizeof(keyEntryData), + cmpEntries, (void *) &arg); - while (ptr - entries < *nentries) + if (arg.haveDups) + { + /* there are duplicates, must get rid of 'em */ + int32 j; + + entries[0] = keydata[0].datum; + nullFlags[0] = keydata[0].isnull; + j = 1; + for (i = 1; i < *nentries; i++) + { + if (cmpEntries(&keydata[i - 1], &keydata[i], &arg) != 0) + { + entries[j] = keydata[i].datum; + nullFlags[j] = keydata[i].isnull; + j++; + } + } + *nentries = j; + } + else { - if (compareEntries(ginstate, attnum, *ptr, *res) != 0) - *(++res) = *ptr++; - else - ptr++; + /* easy, no duplicates */ + for (i = 0; i < *nentries; i++) + { + entries[i] = keydata[i].datum; + nullFlags[i] = keydata[i].isnull; + } } - *nentries = res + 1 - entries; + pfree(keydata); } return entries; @@ -354,3 +522,83 @@ ginoptions(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(rdopts); } + +/* + * Fetch index's statistical data into *stats + * + * Note: in the result, nPendingPages can be trusted to be up-to-date, + * as can ginVersion; but the other fields are as of the last VACUUM. 
+ */ +void +ginGetStats(Relation index, GinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + stats->nPendingPages = metadata->nPendingPages; + stats->nTotalPages = metadata->nTotalPages; + stats->nEntryPages = metadata->nEntryPages; + stats->nDataPages = metadata->nDataPages; + stats->nEntries = metadata->nEntries; + stats->ginVersion = metadata->ginVersion; + + UnlockReleaseBuffer(metabuffer); +} + +/* + * Write the given statistics to the index's metapage + * + * Note: nPendingPages and ginVersion are *not* copied over + */ +void +ginUpdateStats(Relation index, const GinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + START_CRIT_SECTION(); + + metadata->nTotalPages = stats->nTotalPages; + metadata->nEntryPages = stats->nEntryPages; + metadata->nDataPages = stats->nDataPages; + metadata->nEntries = stats->nEntries; + + MarkBufferDirty(metabuffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + ginxlogUpdateMeta data; + XLogRecData rdata; + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) &data; + rdata.len = sizeof(ginxlogUpdateMeta); + rdata.next = NULL; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, &rdata); + PageSetLSN(metapage, recptr); + PageSetTLI(metapage, ThisTimeLineID); + } + + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); +} diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index a13b99cdfa..79c54f16b8 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -4,18 +4,17 @@ * delete & vacuum routines for the postgres GIN * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.33 2010/02/08 04:33:52 tgl Exp $ + * src/backend/access/gin/ginvacuum.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/genam.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "catalog/storage.h" #include "commands/vacuum.h" #include "miscadmin.h" @@ -93,7 +92,7 @@ xlogVacuumPage(Relation index, Buffer buffer) Assert(GinPageIsLeaf(page)); - if (index->rd_istemp) + if (!RelationNeedsWAL(index)) return; data.node = index->rd_node; @@ -190,7 +189,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, /* saves changes about deleted tuple ... 
*/ if (oldMaxOff != newMaxOff) { - START_CRIT_SECTION(); if (newMaxOff > 0) @@ -293,7 +291,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif - PageDeletePostingItem(parentPage, myoff); + GinPageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); @@ -308,7 +306,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); - if (!gvs->index->rd_istemp) + if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; XLogRecData rdata[4]; @@ -519,7 +517,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 * store posting tree's roots for further processing, we can't * vacuum it just now due to risk of deadlocks with scans/inserts */ - roots[*nroot] = GinItemPointerGetBlockNumber(&itup->t_tid); + roots[*nroot] = GinGetDownlink(itup); (*nroot)++; } else if (GinGetNPosting(itup) > 0) @@ -533,8 +531,9 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 if (GinGetNPosting(itup) != newN) { - Datum value; OffsetNumber attnum; + Datum key; + GinNullCategory category; /* * Some ItemPointers was deleted, so we should remake our @@ -562,9 +561,9 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); } - value = gin_index_getattr(&gvs->ginstate, itup); attnum = gintuple_get_attrnum(&gvs->ginstate, itup); - itup = GinFormTuple(gvs->index, &gvs->ginstate, attnum, value, + key = gintuple_get_key(&gvs->ginstate, itup, &category); + itup = GinFormTuple(&gvs->ginstate, attnum, key, category, GinGetPosting(itup), newN, true); PageIndexTupleDelete(tmppage, i); @@ -606,7 +605,7 @@ ginbulkdelete(PG_FUNCTION_ARGS) /* Yes, so initialize stats to zeroes */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* and cleanup any pending inserts */ - ginInsertCleanup(index, &gvs.ginstate, true, stats); + ginInsertCleanup(&gvs.ginstate, true, stats); } /* we'll re-count the tuples each time */ @@ -642,7 +641,7 @@ ginbulkdelete(PG_FUNCTION_ARGS) Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); - blkno = GinItemPointerGetBlockNumber(&(itup)->t_tid); + blkno = GinGetDownlink(itup); Assert(blkno != InvalidBlockNumber); UnlockReleaseBuffer(buffer); @@ -707,9 +706,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) BlockNumber npages, blkno; BlockNumber totFreePages; - BlockNumber lastBlock = GIN_ROOT_BLKNO, - lastFilledBlock = GIN_ROOT_BLKNO; GinState ginstate; + GinStatsData idxStat; /* * In an autovacuum analyze, we want to clean up pending insertions. @@ -720,7 +718,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) if (IsAutoVacuumWorkerProcess()) { initGinState(&ginstate, index); - ginInsertCleanup(index, &ginstate, true, stats); + ginInsertCleanup(&ginstate, true, stats); } PG_RETURN_POINTER(stats); } @@ -733,9 +731,11 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initGinState(&ginstate, index); - ginInsertCleanup(index, &ginstate, true, stats); + ginInsertCleanup(&ginstate, true, stats); } + memset(&idxStat, 0, sizeof(idxStat)); + /* * XXX we always report the heap tuple count as the number of index * entries. 
This is bogus if the index is partial, but it's real hard to @@ -757,7 +757,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) totFreePages = 0; - for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++) + for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; @@ -771,15 +771,28 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) if (GinPageIsDeleted(page)) { + Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); totFreePages++; } - else - lastFilledBlock = blkno; + else if (GinPageIsData(page)) + { + idxStat.nDataPages++; + } + else if (!GinPageIsList(page)) + { + idxStat.nEntryPages++; + + if (GinPageIsLeaf(page)) + idxStat.nEntries += PageGetMaxOffsetNumber(page); + } UnlockReleaseBuffer(buffer); } - lastBlock = npages - 1; + + /* Update the metapage with accurate page and entry counts */ + idxStat.nTotalPages = npages; + ginUpdateStats(info->index, &idxStat); /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index cff5bc8bd5..c954bcb12f 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -4,16 +4,16 @@ * WAL replay logic for inverted index. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.22 2010/02/09 20:31:24 heikki Exp $ + * src/backend/access/gin/ginxlog.c *------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/gin.h" +#include "access/gin_private.h" #include "access/xlogutils.h" #include "storage/bufmgr.h" #include "utils/memutils.h" @@ -77,11 +77,13 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true); Assert(BufferIsValid(MetaBuffer)); + page = (Page) BufferGetPage(MetaBuffer); + GinInitMetabuffer(MetaBuffer); - page = (Page) BufferGetPage(MetaBuffer); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(MetaBuffer); RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); Assert(BufferIsValid(RootBuffer)); @@ -91,11 +93,10 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - - MarkBufferDirty(MetaBuffer); - UnlockReleaseBuffer(MetaBuffer); MarkBufferDirty(RootBuffer); + UnlockReleaseBuffer(RootBuffer); + UnlockReleaseBuffer(MetaBuffer); } static void @@ -128,21 +129,49 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* first, forget any incomplete split this insertion completes */ + if (data->isData) + { + Assert(data->isDelete == FALSE); + if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber) + { + PostingItem *pitem; + + pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert)); + forgetIncompleteSplit(data->node, + PostingItemGetBlockNumber(pitem), + data->updateBlkno); + } + + } + else + { + if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber) + { + IndexTuple itup; + + itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert)); + forgetIncompleteSplit(data->node, + GinGetDownlink(itup), + data->updateBlkno); + } + } + /* nothing else to do if page was backed up */ if (record->xl_info & XLR_BKP_BLOCK_1) return; buffer = XLogReadBuffer(data->node, data->blkno, false); - 
Assert(BufferIsValid(buffer)); + if (!BufferIsValid(buffer)) + return; /* page was deleted, nothing to do */ page = (Page) BufferGetPage(buffer); - if (data->isData) + if (!XLByteLE(lsn, PageGetLSN(page))) { - Assert(data->isDelete == FALSE); - Assert(GinPageIsData(page)); - - if (!XLByteLE(lsn, PageGetLSN(page))) + if (data->isData) { + Assert(GinPageIsData(page)); + if (data->isLeaf) { OffsetNumber i; @@ -172,30 +201,19 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) GinDataPageAddItem(page, pitem, data->offset); } } - - if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber) + else { - PostingItem *pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert)); - - forgetIncompleteSplit(data->node, PostingItemGetBlockNumber(pitem), data->updateBlkno); - } + IndexTuple itup; - } - else - { - IndexTuple itup; + Assert(!GinPageIsData(page)); - Assert(!GinPageIsData(page)); - - if (!XLByteLE(lsn, PageGetLSN(page))) - { if (data->updateBlkno != InvalidBlockNumber) { /* update link to right page after split */ Assert(!GinPageIsLeaf(page)); Assert(data->offset >= FirstOffsetNumber && data->offset <= PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, data->offset)); - ItemPointerSet(&itup->t_tid, data->updateBlkno, InvalidOffsetNumber); + GinSetDownlink(itup, data->updateBlkno); } if (data->isDelete) @@ -212,20 +230,12 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) data->node.spcNode, data->node.dbNode, data->node.relNode); } - if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber) - { - itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert)); - forgetIncompleteSplit(data->node, GinItemPointerGetBlockNumber(&itup->t_tid), data->updateBlkno); - } - } - - if (!XLByteLE(lsn, PageGetLSN(page))) - { PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } + UnlockReleaseBuffer(buffer); } @@ -244,7 +254,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (data->isData) flags |= GIN_DATA; - lbuffer = XLogReadBuffer(data->node, data->lblkno, data->isRootSplit); + lbuffer = XLogReadBuffer(data->node, data->lblkno, true); Assert(BufferIsValid(lbuffer)); lpage = (Page) BufferGetPage(lbuffer); GinInitBuffer(lbuffer, flags); @@ -260,7 +270,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (data->isData) { char *ptr = XLogRecGetData(record) + sizeof(ginxlogSplit); - Size sizeofitem = GinSizeOfItem(lpage); + Size sizeofitem = GinSizeOfDataPageItem(lpage); OffsetNumber i; ItemPointer bound; @@ -321,7 +331,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (data->isRootSplit) { - Buffer rootBuf = XLogReadBuffer(data->node, data->rootBlkno, false); + Buffer rootBuf = XLogReadBuffer(data->node, data->rootBlkno, true); Page rootPage = BufferGetPage(rootBuf); GinInitBuffer(rootBuf, flags & ~GIN_LEAF); @@ -329,12 +339,12 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (data->isData) { Assert(data->rootBlkno != GIN_ROOT_BLKNO); - dataFillRoot(NULL, rootBuf, lbuffer, rbuffer); + ginDataFillRoot(NULL, rootBuf, lbuffer, rbuffer); } else { Assert(data->rootBlkno == GIN_ROOT_BLKNO); - entryFillRoot(NULL, rootBuf, lbuffer, rbuffer); + ginEntryFillRoot(NULL, rootBuf, lbuffer, rbuffer); } PageSetLSN(rootPage, lsn); @@ -357,45 +367,50 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; - /* nothing else to do if page was backed up (and no info to do it with) */ + /* nothing to do if page was backed up (and no info to do it with) */ if (record->xl_info & 
XLR_BKP_BLOCK_1) return; buffer = XLogReadBuffer(data->node, data->blkno, false); - Assert(BufferIsValid(buffer)); + if (!BufferIsValid(buffer)) + return; page = (Page) BufferGetPage(buffer); - if (GinPageIsData(page)) - { - memcpy(GinDataPageGetData(page), XLogRecGetData(record) + sizeof(ginxlogVacuumPage), - GinSizeOfItem(page) *data->nitem); - GinPageGetOpaque(page)->maxoff = data->nitem; - } - else + if (!XLByteLE(lsn, PageGetLSN(page))) { - OffsetNumber i, - *tod; - IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage)); + if (GinPageIsData(page)) + { + memcpy(GinDataPageGetData(page), + XLogRecGetData(record) + sizeof(ginxlogVacuumPage), + data->nitem * GinSizeOfDataPageItem(page)); + GinPageGetOpaque(page)->maxoff = data->nitem; + } + else + { + OffsetNumber i, + *tod; + IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage)); - tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page)); - for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++) - tod[i - 1] = i; + tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page)); + for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++) + tod[i - 1] = i; - PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page)); + PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page)); - for (i = 0; i < data->nitem; i++) - { - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add item to index page in %u/%u/%u", - data->node.spcNode, data->node.dbNode, data->node.relNode); - itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); + for (i = 0; i < data->nitem; i++) + { + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in %u/%u/%u", + data->node.spcNode, data->node.dbNode, data->node.relNode); + itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); + } } - } - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } - MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } @@ -409,38 +424,56 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(data->node, data->blkno, false); - page = BufferGetPage(buffer); - Assert(GinPageIsData(page)); - GinPageGetOpaque(page)->flags = GIN_DELETED; - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + Assert(GinPageIsData(page)); + GinPageGetOpaque(page)->flags = GIN_DELETED; + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } } if (!(record->xl_info & XLR_BKP_BLOCK_2)) { buffer = XLogReadBuffer(data->node, data->parentBlkno, false); - page = BufferGetPage(buffer); - Assert(GinPageIsData(page)); - Assert(!GinPageIsLeaf(page)); - PageDeletePostingItem(page, data->parentOffset); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + Assert(GinPageIsData(page)); + 
Assert(!GinPageIsLeaf(page)); + GinPageDeletePostingItem(page, data->parentOffset); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } } if (!(record->xl_info & XLR_BKP_BLOCK_3) && data->leftBlkno != InvalidBlockNumber) { buffer = XLogReadBuffer(data->node, data->leftBlkno, false); - page = BufferGetPage(buffer); - Assert(GinPageIsData(page)); - GinPageGetOpaque(page)->rightlink = data->rightLink; - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + Assert(GinPageIsData(page)); + GinPageGetOpaque(page)->rightlink = data->rightLink; + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } } } @@ -450,8 +483,11 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record); Buffer metabuffer; Page metapage; + Buffer buffer; metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + if (!BufferIsValid(metabuffer)) + elog(PANIC, "GIN metapage disappeared"); metapage = BufferGetPage(metabuffer); if (!XLByteLE(lsn, PageGetLSN(metapage))) @@ -469,40 +505,43 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) */ if (!(record->xl_info & XLR_BKP_BLOCK_1)) { - Buffer buffer = XLogReadBuffer(data->node, data->metadata.tail, false); - Page page = BufferGetPage(buffer); - - if (!XLByteLE(lsn, PageGetLSN(page))) + buffer = XLogReadBuffer(data->node, data->metadata.tail, false); + if (BufferIsValid(buffer)) { - OffsetNumber l, - off = (PageIsEmpty(page)) ? FirstOffsetNumber : - OffsetNumberNext(PageGetMaxOffsetNumber(page)); - int i, - tupsize; - IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta)); - - for (i = 0; i < data->ntuples; i++) + Page page = BufferGetPage(buffer); + + if (!XLByteLE(lsn, PageGetLSN(page))) { - tupsize = IndexTupleSize(tuples); + OffsetNumber l, + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + int i, + tupsize; + IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta)); - l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); + for (i = 0; i < data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); - if (l == InvalidOffsetNumber) - elog(ERROR, "failed to add item to index page"); + l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); - tuples = (IndexTuple) (((char *) tuples) + tupsize); - } + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); - /* - * Increase counter of heap tuples - */ - GinPageGetOpaque(page)->maxoff++; + tuples = (IndexTuple) (((char *) tuples) + tupsize); + } - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + /* + * Increase counter of heap tuples + */ + GinPageGetOpaque(page)->maxoff++; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } - UnlockReleaseBuffer(buffer); } } else if (data->prevTail != InvalidBlockNumber) @@ -510,19 +549,21 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) /* * New tail */ - - Buffer buffer = XLogReadBuffer(data->node, data->prevTail, false); - Page page = BufferGetPage(buffer); - - if (!XLByteLE(lsn, PageGetLSN(page))) + buffer = XLogReadBuffer(data->node, data->prevTail, false); + if (BufferIsValid(buffer)) { - GinPageGetOpaque(page)->rightlink = data->newRightlink; + Page page = BufferGetPage(buffer); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } - UnlockReleaseBuffer(buffer); } UnlockReleaseBuffer(metabuffer); @@ -544,6 +585,7 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) return; buffer = XLogReadBuffer(data->node, data->blkno, true); + Assert(BufferIsValid(buffer)); page = BufferGetPage(buffer); GinInitBuffer(buffer, GIN_LIST); @@ -587,6 +629,8 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) int i; metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + if (!BufferIsValid(metabuffer)) + elog(PANIC, "GIN metapage disappeared"); metapage = BufferGetPage(metabuffer); if (!XLByteLE(lsn, PageGetLSN(metapage))) @@ -600,18 +644,22 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->ndeleted; i++) { Buffer buffer = XLogReadBuffer(data->node, data->toDelete[i], false); - Page page = BufferGetPage(buffer); - if (!XLByteLE(lsn, PageGetLSN(page))) + if (BufferIsValid(buffer)) { - GinPageGetOpaque(page)->flags = GIN_DELETED; + Page page = BufferGetPage(buffer); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - } + if (!XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->flags = GIN_DELETED; - UnlockReleaseBuffer(buffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + + UnlockReleaseBuffer(buffer); + } } UnlockReleaseBuffer(metabuffer); } @@ -745,6 +793,7 @@ static void ginContinueSplit(ginIncompleteSplit *split) { GinBtreeData btree; + GinState ginstate; Relation reln; Buffer buffer; GinBtreeStack stack; @@ -755,18 +804,30 @@ ginContinueSplit(ginIncompleteSplit *split) */ buffer = XLogReadBuffer(split->node, 
split->leftBlkno, false); + /* + * Failure should be impossible here, because we wrote the page earlier. + */ + if (!BufferIsValid(buffer)) + elog(PANIC, "ginContinueSplit: left block %u not found", + split->leftBlkno); + reln = CreateFakeRelcacheEntry(split->node); if (split->rootBlkno == GIN_ROOT_BLKNO) { - prepareEntryScan(&btree, reln, InvalidOffsetNumber, (Datum) 0, NULL); + MemSet(&ginstate, 0, sizeof(ginstate)); + ginstate.index = reln; + + ginPrepareEntryScan(&btree, + InvalidOffsetNumber, (Datum) 0, GIN_CAT_NULL_KEY, + &ginstate); btree.entry = ginPageGetLinkItup(buffer); } else { Page page = BufferGetPage(buffer); - prepareDataScan(&btree, reln); + ginPrepareDataScan(&btree, reln); PostingItemSetBlockNumber(&(btree.pitem), split->leftBlkno); if (GinPageIsLeaf(page)) @@ -784,8 +845,8 @@ ginContinueSplit(ginIncompleteSplit *split) stack.off = InvalidOffsetNumber; stack.parent = NULL; - findParents(&btree, &stack, split->rootBlkno); - ginInsertValue(&btree, stack.parent); + ginFindParents(&btree, &stack, split->rootBlkno); + ginInsertValue(&btree, stack.parent, NULL); FreeFakeRelcacheEntry(reln); diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile index 298e9309f5..f8051a2b45 100644 --- a/src/backend/access/gist/Makefile +++ b/src/backend/access/gist/Makefile @@ -4,7 +4,7 @@ # Makefile for access/gist # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/gist/Makefile,v 1.18 2008/02/19 10:30:06 petere Exp $ +# src/backend/access/gist/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 6c90e508bf..2d78dcb0df 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/gist/README,v 1.5 2010/04/14 20:17:26 rhaas Exp $ +src/backend/access/gist/README GiST Indexing ============= @@ -20,33 +20,34 @@ The current implementation of GiST supports: * Variable length keys * Composite keys (multi-key) + * Ordered search (nearest-neighbor search) * provides NULL-safe interface to GiST core * Concurrency * Recovery support via WAL logging -The support for concurrency implemented in PostgreSQL was developed based on -the paper "Access Methods for Next-Generation Database Systems" by +The support for concurrency implemented in PostgreSQL was developed based on +the paper "Access Methods for Next-Generation Database Systems" by Marcel Kornaker: http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz The original algorithms were modified in several ways: -* They should be adapted to PostgreSQL conventions. For example, the SEARCH - algorithm was considerably changed, because in PostgreSQL function search - should return one tuple (next), not all tuples at once. Also, it should +* They had to be adapted to PostgreSQL conventions. For example, the SEARCH + algorithm was considerably changed, because in PostgreSQL the search function + should return one tuple (next), not all tuples at once. Also, it should release page locks between calls. -* Since we added support for variable length keys, it's not possible to - guarantee enough free space for all keys on pages after splitting. User - defined function picksplit doesn't have information about size of tuples +* Since we added support for variable length keys, it's not possible to + guarantee enough free space for all keys on pages after splitting. 
User + defined function picksplit doesn't have information about size of tuples (each tuple may contain several keys as in multicolumn index while picksplit could work with only one key) and pages. -* We modified original INSERT algorithm for performance reason. In particular, +* We modified original INSERT algorithm for performance reasons. In particular, it is now a single-pass algorithm. * Since the papers were theoretical, some details were omitted and we - have to find out ourself how to solve some specific problems. + had to find out ourself how to solve some specific problems. -Because of the above reasons, we have to revised interaction of GiST +Because of the above reasons, we have revised the interaction of GiST core and PostgreSQL WAL system. Moreover, we encountered (and solved) a problem of uncompleted insertions when recovering after crash, which was not touched in the paper. @@ -54,96 +55,127 @@ was not touched in the paper. Search Algorithm ---------------- -Function gettuple finds a tuple which satisfies the search -predicate. It store their state and returns next tuple under -subsequent calls. Stack contains page, its LSN and LSN of parent page -and currentposition is saved between calls. +The search code maintains a queue of unvisited items, where an "item" is +either a heap tuple known to satisfy the search conditions, or an index +page that is consistent with the search conditions according to inspection +of its parent page's downlink item. Initially the root page is searched +to find unvisited items in it. Then we pull items from the queue. A +heap tuple pointer is just returned immediately; an index page entry +causes that page to be searched, generating more queue entries. -gettuple(search-pred) - if ( firsttime ) - push(stack, [root, 0, 0]) // page, LSN, parentLSN - currentposition=0 - end - ptr = top of stack - while(true) - latch( ptr->page, S-mode ) - if ( ptr->page->lsn != ptr->lsn ) - ptr->lsn = ptr->page->lsn - currentposition=0 - if ( ptr->parentlsn < ptr->page->nsn ) - add to stack rightlink - else - currentposition++ - end +The queue is kept ordered with heap tuple items at the front, then +index page entries, with any newly-added index page entry inserted +before existing index page entries. This ensures depth-first traversal +of the index, and in particular causes the first few heap tuples to be +returned as soon as possible. That is helpful in case there is a LIMIT +that requires only a few tuples to be produced. - while(true) - currentposition = find_first_match( currentposition ) - if ( currentposition is invalid ) - unlatch( ptr->page ) - pop stack - ptr = top of stack - if (ptr is NULL) - return NULL - break loop - else if ( ptr->page is leaf ) - unlatch( ptr->page ) - return tuple - else - add to stack child page - end - currentposition++ - end - end +To implement nearest-neighbor search, the queue entries are augmented +with distance data: heap tuple entries are labeled with exact distance +from the search argument, while index-page entries must be labeled with +the minimum distance that any of their children could have. Then, +queue entries are retrieved in smallest-distance-first order, with +entries having identical distances managed as stated in the previous +paragraph. + +The search algorithm keeps an index page locked only long enough to scan +its entries and queue those that satisfy the search conditions. 
Since +insertions can occur concurrently with searches, it is possible for an +index child page to be split between the time we make a queue entry for it +(while visiting its parent page) and the time we actually reach and scan +the child page. To avoid missing the entries that were moved to the right +sibling, we detect whether a split has occurred by comparing the child +page's NSN to the LSN that the parent had when visited. If it did, the +sibling page is immediately added to the front of the queue, ensuring that +its items will be scanned in the same order as if they were still on the +original child page. + +As is usual in Postgres, the search algorithm only guarantees to find index +entries that existed before the scan started; index entries added during +the scan might or might not be visited. This is okay as long as all +searches use MVCC snapshot rules to reject heap tuples newer than the time +of scan start. In particular, this means that we need not worry about +cases where a parent page's downlink key is "enlarged" after we look at it. +Any such enlargement would be to add child items that we aren't interested +in returning anyway. Insert Algorithm ---------------- -INSERT guarantees that the GiST tree remains balanced. User defined key method -Penalty is used for choosing a subtree to insert; method PickSplit is used for -the node splitting algorithm; method Union is used for propagating changes +INSERT guarantees that the GiST tree remains balanced. User defined key method +Penalty is used for choosing a subtree to insert; method PickSplit is used for +the node splitting algorithm; method Union is used for propagating changes upward to maintain the tree properties. -NOTICE: We modified original INSERT algorithm for performance reason. In -particularly, it is now a single-pass algorithm. +To insert a tuple, we first have to find a suitable leaf page to insert to. +The algorithm walks down the tree, starting from the root, along the path +of smallest Penalty. At each step: -Function findLeaf is used to identify subtree for insertion. Page, in which -insertion is proceeded, is locked as well as its parent page. Functions -findParent and findPath are used to find parent pages, which could be changed -because of concurrent access. Function pageSplit is recurrent and could split -page by more than 2 pages, which could be necessary if keys have different -lengths or more than one key are inserted (in such situation, user defined -function pickSplit cannot guarantee free space on page). +1. Has this page been split since we looked at the parent? If so, it's +possible that we should be inserting to the other half instead, so retreat +back to the parent. +2. If this is a leaf node, we've found our target node. +3. Otherwise use Penalty to pick a new target subtree. +4. Check the key representing the target subtree. If it doesn't already cover +the key we're inserting, replace it with the Union of the old downlink key +and the key being inserted. (Actually, we always call Union, and just skip +the replacement if the Unioned key is the same as the existing key) +5. Replacing the key in step 4 might cause the page to be split. In that case, +propagate the change upwards and restart the algorithm from the first parent +that didn't need to be split. +6. Walk down to the target subtree, and goto 1. + +This differs from the insertion algorithm in the original paper. 
In the
+original paper, you first walk down the tree until you reach a leaf page, and
+then you adjust the downlink in the parent, propagating the adjustment up,
+all the way up to the root in the worst case. But we adjust the downlinks to
+cover the new key already when we walk down, so that when we reach the leaf
+page, we don't need to update the parents anymore, except to insert the
+downlinks if we have to split the page. This makes crash recovery simpler:
+after inserting a key to the page, the tree is immediately self-consistent
+without having to update the parents. Even if we split a page and crash before
+inserting the downlink to the parent, the tree is self-consistent because the
+right half of the split is accessible via the rightlink of the left page
+(which replaced the original page).
+
+Note that the algorithm can walk up and down the tree before reaching a leaf
+page, if internal pages need to split while adjusting the downlinks for the
+new key. Eventually, you should reach the bottom, and proceed with the
+insertion of the new tuple.
+
+Once we've found the target page to insert to, we check if there's room
+for the new tuple. If there is, the tuple is inserted, and we're done.
+If it doesn't fit, however, the page needs to be split. Note that it is
+possible that a page needs to be split into more than two pages, if keys have
+different lengths or more than one key is being inserted at a time (which can
+happen when inserting downlinks for a page split that resulted in more than
+two pages at the lower level). After splitting a page, the parent page needs
+to be updated. The downlink for the new page needs to be inserted, and the
+downlink for the old page, which became the left half of the split, needs to
+be updated to only cover those tuples that stayed on the left page. Inserting
+the downlink in the parent can again lead to a page split, recursing up to the
+root page in the worst case.
+
+gistplacetopage is the workhorse function that performs one step of the
+insertion. If the tuple fits, it inserts it to the given page; otherwise
+it splits the page, and constructs the new downlink tuples for the split
+pages. The caller must then call gistplacetopage() on the parent page to
+insert the downlink tuples. The parent page that holds the downlink to
+the child might have migrated as a result of concurrent splits of the
+parent; gistFindCorrectParent() is used to find the parent page.
+
+Splitting the root page works slightly differently. At root split,
+gistplacetopage() allocates the new child pages and replaces the old root
+page with the new root containing downlinks to the new children, all in one
+operation.
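
The descent loop just described can be summarized in the same pseudocode
style used for the routines below. This is a simplified sketch rather than
the exact code: locking, error checks, and the root-split case are omitted,
and the names merely follow the functions and methods described above:

insert(new-key)
	stack = [root]
	loop
		page = top of stack
		if ( page's NSN > the LSN the parent had when we visited it )
			pop stack           // concurrent split; rechoose child at parent
		else if ( page is leaf )
			gistplacetopage(page, new-key)  // splits if needed; downlinks
			break                           // are then inserted in the parent
		else
			off = child subtree with smallest Penalty for new-key
			adjusted = Union(downlink at off, new-key)
			if ( adjusted differs from downlink at off )
				gistplacetopage(page, adjusted) // adjust downlink on the way down
			push(stack, child page at off)
		end
	end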
-findLeaf(new-key) - push(stack, [root, 0]) //page, LSN - while(true) - ptr = top of stack - latch( ptr->page, S-mode ) - ptr->lsn = ptr->page->lsn - if ( exists ptr->parent AND ptr->parent->lsn < ptr->page->nsn ) - unlatch( ptr->page ) - pop stack - else if ( ptr->page is not leaf ) - push( stack, [get_best_child(ptr->page, new-key), 0] ) - unlatch( ptr->page ) - else - unlatch( ptr->page ) - latch( ptr->page, X-mode ) - if ( ptr->page is not leaf ) - //the only root page can become a non-leaf - unlatch( ptr->page ) - else if ( ptr->parent->lsn < ptr->page->nsn ) - unlatch( ptr->page ) - pop stack - else - return stack - end - end - end + +findPath is a subroutine of findParent, used when the correct parent page +can't be found by following the rightlinks at the parent level: findPath( stack item ) - push stack, [root, 0, 0] // page, LSN, parent + push stack, [root, 0, 0] // page, LSN, parent while( stack ) ptr = top of stack latch( ptr->page, S-mode ) @@ -152,7 +184,7 @@ findPath( stack item ) end for( each tuple on page ) if ( tuple->pagepointer == item->page ) - return stack + return stack else add to stack at the end [tuple->pagepointer,0, ptr] end @@ -160,12 +192,16 @@ findPath( stack item ) unlatch( ptr->page ) pop stack end - + + +gistFindCorrectParent is used to re-find the parent of a page during +insertion. It might have migrated to the right since we traversed down the +tree because of page splits. + findParent( stack item ) parent = item->parent - latch( parent->page, X-mode ) if ( parent->page->lsn != parent->lsn ) - while(true) + while(true) search parent tuple on parent->page, if found the return rightlink = parent->page->rightlink unlatch( parent->page ) @@ -177,9 +213,13 @@ findParent( stack item ) end newstack = findPath( item->parent ) replace part of stack to new one + latch( parent->page, X-mode ) return findParent( item ) end +pageSplit function decides how to distribute keys to the new pages after +page split: + pageSplit(page, allkeys) (lkeys, rkeys) = pickSplit( allkeys ) if ( page is root ) @@ -200,40 +240,45 @@ pageSplit(page, allkeys) return newkeys -placetopage(page, keysarray) - if ( no space left on page ) - keysarray = pageSplit(page, [ extract_keys(page), keysarray]) - last page in chain gets old NSN, - original and others - new NSN equals to LSN - if ( page is root ) - make new root with keysarray - end - else - put keysarray on page - if ( length of keysarray > 1 ) - keysarray = [ union(keysarray) ] - end - end - -insert(new-key) - stack = findLeaf(new-key) - keysarray = [new-key] - ptr = top of stack - while(true) - findParent( ptr ) //findParent latches parent page - keysarray = placetopage(ptr->page, keysarray) - unlatch( ptr->page ) - pop stack; - ptr = top of stack - if (length of keysarray == 1) - newboundingkey = union(oldboundingkey, keysarray) - if (newboundingkey == oldboundingkey) - unlatch ptr->page - break loop - end - end - end + +Concurrency control +------------------- +As a rule of thumb, if you need to hold a lock on multiple pages at the +same time, the locks should be acquired in the following order: child page +before parent, and left-to-right at the same level. Always acquiring the +locks in the same order avoids deadlocks. + +The search algorithm only looks at and locks one page at a time. Consequently +there's a race condition between a search and a page split. A page split +happens in two phases: 1. The page is split 2. The downlink is inserted to the +parent. 
If a search looks at the parent page between those steps, before the
+downlink is inserted, it will still find the new right half by following the
+rightlink on the left half. But it must not follow the rightlink if it saw the
+downlink in the parent, or the page will be visited twice!
+
+A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan
+sees that flag set, it knows that the right page is missing the downlink, and
+should be visited too. When the split inserts the downlink to the parent, it
+clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the
+child page header to match the LSN of the insertion on the parent. If the
+F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the
+LSN it saw in the parent. If NSN < LSN, the scan looked at the parent page
+before the downlink was inserted, so it should follow the rightlink. Otherwise
+the scan saw the downlink in the parent page, and will/did follow that as
+usual.
+
+A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because
+a page split keeps the child pages locked until the downlink has been inserted
+to the parent and the flag cleared again. But if a crash happens in the middle
+of a page split, before the downlinks are inserted into the parent, that will
+leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine,
+but we'll eventually want to fix that for performance reasons. And more
+importantly, dealing with pages with missing downlink pointers in the parent
+would complicate the insertion algorithm. So when an insertion sees a page
+with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
+crashed in the middle to completion by adding the downlink in the parent.
+
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
-	Oleg Bartunov	<oleg@sai.msu.su>
+	Oleg Bartunov	<oleg@sai.msu.su>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index cec08c7226..8227bfdb88 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -4,11 +4,11 @@
  * interface routines for the postgres GiST index access method.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.158 2010/01/02 16:57:33 momjian Exp $
+ *	  src/backend/access/gist/gist.c
  *
  *-------------------------------------------------------------------------
  */
@@ -17,13 +17,12 @@
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "catalog/index.h"
+#include "catalog/pg_collation.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
 #include "utils/memutils.h"
 
-const XLogRecPtr XLogRecPtrForTemp = {1, 1};
-
 /* Working state for gistbuild and its callback */
 typedef struct
 {
@@ -33,6 +32,12 @@ typedef struct
 	MemoryContext tmpCtx;
 } GISTBuildState;
 
+/* A List of these is used to represent a split-in-progress. */
+typedef struct
+{
+	Buffer		buf;			/* the split page "half" */
+	IndexTuple	downlink;		/* downlink for this half.
*/ +} GISTPageSplitInfo; /* non-export function prototypes */ static void gistbuildCallback(Relation index, @@ -45,8 +50,13 @@ static void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *GISTstate); -static void gistfindleaf(GISTInsertState *state, - GISTSTATE *giststate); +static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); +static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild); +static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo); #define ROTATEDIST(d) do { \ @@ -117,7 +127,7 @@ gistbuild(PG_FUNCTION_ARGS) MarkBufferDirty(buffer); - if (!index->rd_istemp) + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; @@ -132,7 +142,7 @@ gistbuild(PG_FUNCTION_ARGS) PageSetTLI(page, ThisTimeLineID); } else - PageSetLSN(page, XLogRecPtrForTemp); + PageSetLSN(page, GetXLogRecPtrForTemp()); UnlockReleaseBuffer(buffer); @@ -210,6 +220,19 @@ gistbuildCallback(Relation index, } /* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +Datum +gistbuildempty(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unlogged GiST indexes are not supported"))); + + PG_RETURN_VOID(); +} + +/* * gistinsert -- wrapper for GiST tuple insertion. * * This is the public interface routine for tuple insertion in GiSTs. @@ -253,41 +276,52 @@ gistinsert(PG_FUNCTION_ARGS) /* - * Workhouse routine for doing insertion into a GiST index. Note that - * this routine assumes it is invoked in a short-lived memory context, - * so it does not bother releasing palloc'd allocations. + * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple + * at that offset is atomically removed along with inserting the new tuples. + * This is used to replace a tuple with a new one. + * + * If 'leftchildbuf' is valid, we're inserting the downlink for the page + * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. + * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. + * + * If there is not enough room on the page, it is split. All the split + * pages are kept pinned and locked and returned in *splitinfo, the caller + * is responsible for inserting the downlinks for them. However, if + * 'buffer' is the root page and it needs to be split, gistplacetopage() + * performs the split as one atomic operation, and *splitinfo is set to NIL. + * In that case, we continue to hold the root page locked, and the child + * pages are released; note that new tuple(s) are *not* on the root page + * but in one of the new child pages. */ -static void -gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) +static bool +gistplacetopage(GISTInsertState *state, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + Buffer leftchildbuf, + List **splitinfo) { - GISTInsertState state; - - memset(&state, 0, sizeof(GISTInsertState)); - - state.itup = (IndexTuple *) palloc(sizeof(IndexTuple)); - state.itup[0] = (IndexTuple) palloc(IndexTupleSize(itup)); - memcpy(state.itup[0], itup, IndexTupleSize(itup)); - state.ituplen = 1; - state.freespace = freespace; - state.r = r; - state.key = itup->t_tid; - state.needInsertComplete = true; + Page page = BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? 
true : false; + XLogRecPtr recptr; + int i; + bool is_split; - state.stack = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); - state.stack->blkno = GIST_ROOT_BLKNO; + /* + * Refuse to modify a page that's incompletely split. This should not + * happen because we finish any incomplete splits while we walk down the + * tree. However, it's remotely possible that another concurrent inserter + * splits a parent page, and errors out before completing the split. We + * will just throw an error in that case, and leave any split we had in + * progress unfinished too. The next insert that comes along will clean up + * the mess. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); - gistfindleaf(&state, giststate); - gistmakedeal(&state, giststate); -} - -static bool -gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) -{ - bool is_splitted = false; - bool is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false; + *splitinfo = NIL; /* - * if (!is_leaf) remove old key: This node's key has been modified, either + * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. @@ -295,77 +329,136 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. - * - * XXX: If we want to change fillfactors between node and leaf, fillfactor - * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor) */ - if (gistnospace(state->stack->page, state->itup, state->ituplen, - is_leaf ? InvalidOffsetNumber : state->stack->childoffnum, - state->freespace)) + is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace); + if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; - BlockNumber rrlink = InvalidBlockNumber; - GistNSN oldnsn; + BlockNumber oldrlink = InvalidBlockNumber; + GistNSN oldnsn = {0, 0}; + SplitedPageLayout rootpg; + BlockNumber blkno = BufferGetBlockNumber(buffer); + bool is_rootsplit; - is_splitted = true; + is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* - * Form index tuples vector to split: remove old tuple if t's needed - * and add new tuples to vector + * Form index tuples vector to split. If we're replacing an old tuple, + * remove the old version from the vector. */ - itvec = gistextractpage(state->stack->page, &tlen); - if (!is_leaf) + itvec = gistextractpage(page, &tlen); + if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ - int pos = state->stack->childoffnum - FirstOffsetNumber; + int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } - itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen); - dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate); + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(state->r, page, itvec, tlen, giststate); - state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen); - state->ituplen = 0; - - if (state->stack->blkno != GIST_ROOT_BLKNO) + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. 
The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. + */ + ptr = dist; + if (!is_rootsplit) { - /* - * if non-root split then we should not allocate new buffer, but - * we must create temporary page to operate - */ - dist->buffer = state->stack->buffer; - dist->page = PageGetTempPageCopySpecial(BufferGetPage(dist->buffer)); + /* save old rightlink and NSN */ + oldrlink = GistPageGetOpaque(page)->rightlink; + oldnsn = GistPageGetOpaque(page)->nsn; + + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(state->r); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); } - /* make new pages and fills them */ + /* + * Now that we know whick blocks the new pages go to, set up downlink + * tuples to point to them. + */ for (ptr = dist; ptr; ptr = ptr->next) { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of requiring the caller to insert + * them. Add the new root page to the list along with the child pages. + */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; int i; - char *data; - /* get new page */ - if (ptr->buffer == InvalidBuffer) + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + else + { + /* Prepare split-info to be returned to caller */ + for (ptr = dist; ptr; ptr = ptr->next) { - ptr->buffer = gistNewBuffer(state->r); - GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); - ptr->page = BufferGetPage(ptr->buffer); + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + + si->buf = ptr->buffer; + si->downlink = ptr->itup; + *splitinfo = lappend(*splitinfo, si); } - ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); - /* - * fill page, we can do it because all these pages are new (ie not - * linked in tree or masked by temp page - */ - data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) @@ -373,276 +466,388 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) data += IndexTupleSize((IndexTuple) data); } - /* set up ItemPointer and remember it for parent */ - ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); - state->itup[state->ituplen] = ptr->itup; - state->ituplen++; - } + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; - /* saves old rightlink */ - if (state->stack->blkno != GIST_ROOT_BLKNO) - rrlink = GistPageGetOpaque(dist->page)->rightlink; + if (ptr->next && !is_rootsplit) + GistMarkFollowRight(ptr->page); + else + GistClearFollowRight(ptr->page); + + /* + * Copy the NSN of the original page to all pages. The + * F_FOLLOW_RIGHT flags ensure that scans will follow the + * rightlinks until the downlinks are inserted. + */ + GistPageGetOpaque(ptr->page)->nsn = oldnsn; + } START_CRIT_SECTION(); /* - * must mark buffers dirty before XLogInsert, even though we'll still - * be changing their opaque fields below. set up right links. + * Must mark buffers dirty before XLogInsert, even though we'll still + * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) - { MarkBufferDirty(ptr->buffer); - GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ? - ptr->next->block.blkno : rrlink; - } - - /* restore splitted non-root page */ - if (state->stack->blkno != GIST_ROOT_BLKNO) - { - PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); - dist->page = BufferGetPage(dist->buffer); - } - - if (!state->r->rd_istemp) - { - XLogRecPtr recptr; - XLogRecData *rdata; - - rdata = formSplitRdata(state->r->rd_node, state->stack->blkno, - is_leaf, &(state->key), dist); + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); + /* + * The first page in the chain was a temporary working copy meant to + * replace the old page. Copy it over the old page. + */ + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); - for (ptr = dist; ptr; ptr = ptr->next) - { - PageSetLSN(ptr->page, recptr); - PageSetTLI(ptr->page, ThisTimeLineID); - } - } + /* Write the WAL record */ + if (RelationNeedsWAL(state->r)) + recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf, + dist, oldrlink, oldnsn, leftchildbuf); else - { - for (ptr = dist; ptr; ptr = ptr->next) - { - PageSetLSN(ptr->page, XLogRecPtrForTemp); - } - } - - /* set up NSN */ - oldnsn = GistPageGetOpaque(dist->page)->nsn; - if (state->stack->blkno == GIST_ROOT_BLKNO) - /* if root split we should put initial value */ - oldnsn = PageGetLSN(dist->page); + recptr = GetXLogRecPtrForTemp(); for (ptr = dist; ptr; ptr = ptr->next) { - /* only for last set oldnsn */ - GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ? 
- PageGetLSN(ptr->page) : oldnsn; + PageSetLSN(ptr->page, recptr); + PageSetTLI(ptr->page, ThisTimeLineID); } /* - * release buffers, if it was a root split then release all buffers - * because we create all buffers + * Return the new child buffers to the caller. + * + * If this was a root split, we've already inserted the downlink + * pointers, in the form of a new root page. Therefore we can release + * all the new buffers, and keep just the root page locked. */ - ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next; - for (; ptr; ptr = ptr->next) - UnlockReleaseBuffer(ptr->buffer); - - if (state->stack->blkno == GIST_ROOT_BLKNO) + if (is_rootsplit) { - gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key)); - state->needInsertComplete = false; + for (ptr = dist->next; ptr; ptr = ptr->next) + UnlockReleaseBuffer(ptr->buffer); } - - END_CRIT_SECTION(); } else { - /* enough space */ + /* + * Enough space. We also get here if ntuples==0. + */ START_CRIT_SECTION(); - if (!is_leaf) - PageIndexTupleDelete(state->stack->page, state->stack->childoffnum); - gistfillbuffer(state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber); + if (OffsetNumberIsValid(oldoffnum)) + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); - MarkBufferDirty(state->stack->buffer); + MarkBufferDirty(buffer); - if (!state->r->rd_istemp) + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + if (RelationNeedsWAL(state->r)) { - OffsetNumber noffs = 0, - offs[1]; - XLogRecPtr recptr; - XLogRecData *rdata; + OffsetNumber ndeloffs = 0, + deloffs[1]; - if (!is_leaf) + if (OffsetNumberIsValid(oldoffnum)) { - /* only on inner page we should delete previous version */ - offs[0] = state->stack->childoffnum; - noffs = 1; + deloffs[0] = oldoffnum; + ndeloffs = 1; } - rdata = formUpdateRdata(state->r->rd_node, state->stack->buffer, - offs, noffs, - state->itup, state->ituplen, - &(state->key)); + recptr = gistXLogUpdate(state->r->rd_node, buffer, + deloffs, ndeloffs, itup, ntup, + leftchildbuf); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); - PageSetLSN(state->stack->page, recptr); - PageSetTLI(state->stack->page, ThisTimeLineID); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); } else - PageSetLSN(state->stack->page, XLogRecPtrForTemp); - - if (state->stack->blkno == GIST_ROOT_BLKNO) - state->needInsertComplete = false; + { + recptr = GetXLogRecPtrForTemp(); + PageSetLSN(page, recptr); + } - END_CRIT_SECTION(); + *splitinfo = NIL; + } - if (state->ituplen > 1) - { /* previous is_splitted==true */ + /* + * If we inserted the downlink for a child page, set NSN and clear + * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to + * follow the rightlink if and only if they looked at the parent page + * before we inserted the downlink. + * + * Note that we do this *after* writing the WAL record. That means that + * the possible full page image in the WAL record does not include these + * changes, and they must be replayed even if the page is restored from + * the full page image. There's a chicken-and-egg problem: if we updated + * the child pages first, we wouldn't know the recptr of the WAL record + * we're about to write. 
+	 */
+	if (BufferIsValid(leftchildbuf))
+	{
+		Page		leftpg = BufferGetPage(leftchildbuf);
 
-			/*
-			 * child was splited, so we must form union for insertion in
-			 * parent
-			 */
-			IndexTuple	newtup = gistunion(state->r, state->itup, state->ituplen, giststate);
+		GistPageGetOpaque(leftpg)->nsn = recptr;
+		GistClearFollowRight(leftpg);
 
-			ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno);
-			state->itup[0] = newtup;
-			state->ituplen = 1;
-		}
-		else if (is_leaf)
-		{
-			/*
-			 * itup[0] store key to adjust parent, we set it to valid to
-			 * correct check by GistTupleIsInvalid macro in gistgetadjusted()
-			 */
-			ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno);
-			GistTupleSetValid(state->itup[0]);
-		}
+		PageSetLSN(leftpg, recptr);
+		PageSetTLI(leftpg, ThisTimeLineID);
 	}
-	return is_splitted;
+
+	END_CRIT_SECTION();
+
+	return is_split;
 }
 
 /*
- * returns stack of pages, all pages in stack are pinned, and
- * leaf is X-locked
+ * Workhorse routine for doing insertion into a GiST index. Note that
+ * this routine assumes it is invoked in a short-lived memory context,
+ * so it does not bother releasing palloc'd allocations.
  */
-
 static void
-gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 {
 	ItemId		iid;
 	IndexTuple	idxtuple;
-	GISTPageOpaque opaque;
+	GISTInsertStack firststack;
+	GISTInsertStack *stack;
+	GISTInsertState state;
+	bool		xlocked = false;
+
+	memset(&state, 0, sizeof(GISTInsertState));
+	state.freespace = freespace;
+	state.r = r;
+
+	/* Start from the root */
+	firststack.blkno = GIST_ROOT_BLKNO;
+	firststack.lsn.xrecoff = 0;
+	firststack.parent = NULL;
+	state.stack = stack = &firststack;
 
 	/*
-	 * walk down, We don't lock page for a long time, but so we should be
-	 * ready to recheck path in a bad case... We remember, that page->lsn
-	 * should never be invalid.
+	 * Walk down along the path of smallest penalty, updating the parent
+	 * pointers with the key we're inserting as we go. If we crash in the
+	 * middle, the tree is consistent, although the possible parent updates
+	 * were a waste.
 	 */
 	for (;;)
 	{
-		if (XLogRecPtrIsInvalid(state->stack->lsn))
-			state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
-		LockBuffer(state->stack->buffer, GIST_SHARE);
-		gistcheckpage(state->r, state->stack->buffer);
+		if (XLogRecPtrIsInvalid(stack->lsn))
+			stack->buffer = ReadBuffer(state.r, stack->blkno);
+
+		/*
+		 * Be optimistic and grab shared lock first. Swap it for an exclusive
+		 * lock later if we need to update the page.
+		 */
+		if (!xlocked)
+		{
+			LockBuffer(stack->buffer, GIST_SHARE);
+			gistcheckpage(state.r, stack->buffer);
+		}
 
-		state->stack->page = (Page) BufferGetPage(state->stack->buffer);
-		opaque = GistPageGetOpaque(state->stack->page);
+		stack->page = (Page) BufferGetPage(stack->buffer);
+		stack->lsn = PageGetLSN(stack->page);
+		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
 
-		state->stack->lsn = PageGetLSN(state->stack->page);
-		Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn));
+		/*
+		 * If this page was split but the downlink was never inserted to the
+		 * parent because the inserting backend crashed before doing that, fix
+		 * that now.
+ */ + if (GistFollowRight(stack->page)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + /* someone might've completed the split when we unlocked */ + if (!GistFollowRight(stack->page)) + continue; + } + gistfixsplit(&state, giststate); - if (state->stack->blkno != GIST_ROOT_BLKNO && - XLByteLT(state->stack->parent->lsn, opaque->nsn)) + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if (stack->blkno != GIST_ROOT_BLKNO && + XLByteLT(stack->parent->lsn, + GistPageGetOpaque(stack->page)->nsn)) { /* - * caused split non-root page is detected, go up to parent to - * choose best child + * Concurrent split detected. There's no guarantee that the + * downlink for this page is consistent with the tuple we're + * inserting anymore, so go back to parent and rechoose the best + * child. */ - UnlockReleaseBuffer(state->stack->buffer); - state->stack = state->stack->parent; + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; continue; } - if (!GistPageIsLeaf(state->stack->page)) + if (!GistPageIsLeaf(stack->page)) { /* - * This is an internal page, so continue to walk down the tree. We - * find the child node that has the minimum insertion penalty and - * recursively invoke ourselves to modify that node. Once the - * recursive call returns, we may need to adjust the parent node - * for two reasons: the child node split, or the key in this node - * needs to be adjusted for the newly inserted key below us. + * This is an internal page so continue to walk down the tree. + * Find the child node that has the minimum insertion penalty. */ - GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); - - state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate); + BlockNumber childblkno; + IndexTuple newtup; + GISTInsertStack *item; - iid = PageGetItemId(state->stack->page, state->stack->childoffnum); - idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid); - item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); - LockBuffer(state->stack->buffer, GIST_UNLOCK); + stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate); + iid = PageGetItemId(stack->page, stack->childoffnum); + idxtuple = (IndexTuple) PageGetItem(stack->page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); - item->parent = state->stack; - item->child = NULL; - if (state->stack) - state->stack->child = item; - state->stack = item; - } - else - { - /* be carefull, during unlock/lock page may be changed... */ - LockBuffer(state->stack->buffer, GIST_UNLOCK); - LockBuffer(state->stack->buffer, GIST_EXCLUSIVE); - state->stack->page = (Page) BufferGetPage(state->stack->buffer); - opaque = GistPageGetOpaque(state->stack->page); + /* + * Check that it's not a leftover invalid tuple from pre-9.1 + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(r)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."), + errhint("Please REINDEX it."))); - if (state->stack->blkno == GIST_ROOT_BLKNO) + /* + * Check that the key representing the target child node is + * consistent with the key we're inserting. Update it if it's not. 
+			 */
+			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
+			if (newtup)
 			{
 				/*
-				 * the only page can become inner instead of leaf is a root
-				 * page, so for root we should recheck it
+				 * Swap shared lock for an exclusive one. Beware, the page may
+				 * change while we unlock/lock the page...
 				 */
-				if (!GistPageIsLeaf(state->stack->page))
+				if (!xlocked)
 				{
-					/*
-					 * very rarely situation: during unlock/lock index with
-					 * number of pages = 1 was increased
-					 */
-					LockBuffer(state->stack->buffer, GIST_UNLOCK);
-					continue;
+					LockBuffer(stack->buffer, GIST_UNLOCK);
+					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+					xlocked = true;
+					stack->page = (Page) BufferGetPage(stack->buffer);
+
+					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
+					{
+						/* the page was changed while we unlocked it, retry */
+						continue;
+					}
 				}
 
 				/*
-				 * we don't need to check root split, because checking
-				 * leaf/inner is enough to recognize split for root
+				 * Update the tuple.
+				 *
+				 * We still hold the lock after gistinserttuples(), but it
+				 * might have to split the page to make the updated tuple fit.
+				 * In that case the updated tuple might migrate to the other
+				 * half of the split, so we have to go back to the parent and
+				 * descend back to the half that's a better fit for the new
+				 * tuple.
 				 */
-
+				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
+									 stack->childoffnum, InvalidBuffer))
+				{
+					/*
+					 * If this was a root split, the root page continues to be
+					 * the parent and the updated tuple went to one of the
+					 * child pages, so we just need to retry from the root
+					 * page.
+					 */
+					if (stack->blkno != GIST_ROOT_BLKNO)
+					{
+						UnlockReleaseBuffer(stack->buffer);
+						xlocked = false;
+						state.stack = stack = stack->parent;
+					}
+					continue;
+				}
 			}
-			else if (XLByteLT(state->stack->parent->lsn, opaque->nsn))
+			LockBuffer(stack->buffer, GIST_UNLOCK);
+			xlocked = false;
+
+			/* descend to the chosen child */
+			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+			item->blkno = childblkno;
+			item->parent = stack;
+			state.stack = stack = item;
+		}
+		else
+		{
+			/*
+			 * Leaf page. Insert the new key. We've already updated all the
+			 * parents on the way down, but we might have to split the page if
+			 * it doesn't fit. gistinserttuples() will take care of that.
+			 */
+
+			/*
+			 * Swap shared lock for an exclusive one. Be careful, the page may
+			 * change while we unlock/lock the page...
+			 */
+			if (!xlocked)
 			{
-				/*
-				 * detecting split during unlock/lock, so we should find
-				 * better child on parent
-				 */
+				LockBuffer(stack->buffer, GIST_UNLOCK);
+				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+				xlocked = true;
+				stack->page = (Page) BufferGetPage(stack->buffer);
+				stack->lsn = PageGetLSN(stack->page);
 
-				/* forget buffer */
-				UnlockReleaseBuffer(state->stack->buffer);
+				if (stack->blkno == GIST_ROOT_BLKNO)
+				{
+					/*
+					 * the only page that can become inner instead of leaf is
+					 * the root page, so for root we should recheck it
+					 */
+					if (!GistPageIsLeaf(stack->page))
+					{
+						/*
+						 * very rare situation: during unlock/lock index with
+						 * number of pages = 1 was increased
+						 */
+						LockBuffer(stack->buffer, GIST_UNLOCK);
+						xlocked = false;
+						continue;
+					}
 
-				state->stack = state->stack->parent;
-				continue;
+					/*
+					 * we don't need to check root split, because checking
+					 * leaf/inner is enough to recognize split for root
+					 */
+				}
+				else if (GistFollowRight(stack->page) ||
+						 XLByteLT(stack->parent->lsn,
+								  GistPageGetOpaque(stack->page)->nsn))
+				{
+					/*
+					 * The page was split while we momentarily unlocked the
+					 * page. Go back to parent.
+ */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } } - state->stack->lsn = PageGetLSN(state->stack->page); + /* now state.stack->(page, buffer and blkno) points to leaf page */ + + gistinserttuples(&state, stack, giststate, &itup, 1, + InvalidOffsetNumber, InvalidBuffer); + LockBuffer(stack->buffer, GIST_UNLOCK); - /* ok we found a leaf page and it X-locked */ + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + ReleaseBuffer(stack->buffer); break; } } - - /* now state->stack->(page, buffer and blkno) points to leaf page */ } /* @@ -650,7 +855,7 @@ gistfindleaf(GISTInsertState *state, GISTSTATE *giststate) * * returns from the beginning of closest parent; * - * To prevent deadlocks, this should lock only one page simultaneously. + * To prevent deadlocks, this should lock only one page at a time. */ GISTInsertStack * gistFindPath(Relation r, BlockNumber child) @@ -685,6 +890,13 @@ gistFindPath(Relation r, BlockNumber child) top->lsn = PageGetLSN(page); + /* + * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a + * downlink. This should not normally happen.. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { @@ -713,8 +925,6 @@ gistFindPath(Relation r, BlockNumber child) ptr = top; while (ptr->parent) { - /* set child link */ - ptr->parent->child = ptr; /* move childoffnum.. */ if (ptr == top) { @@ -756,17 +966,16 @@ gistFindPath(Relation r, BlockNumber child) return NULL; } - /* - * Returns X-locked parent of stack page + * Updates the stack so that child->parent is the correct parent of the + * child. child->parent must be exclusively locked on entry, and will + * remain so at exit, but it might not be the same page anymore. */ - static void gistFindCorrectParent(Relation r, GISTInsertStack *child) { GISTInsertStack *parent = child->parent; - LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); @@ -838,83 +1047,232 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child) /* install new chain of parents to stack */ child->parent = parent; - parent->child = child; /* make recursive call to normal processing */ + LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); gistFindCorrectParent(r, child); } return; } -void -gistmakedeal(GISTInsertState *state, GISTSTATE *giststate) +/* + * Form a downlink pointer for the page in 'buf'. + */ +static IndexTuple +gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, + GISTInsertStack *stack) { - int is_splitted; - ItemId iid; - IndexTuple oldtup, - newtup; + Page page = BufferGetPage(buf); + OffsetNumber maxoff; + OffsetNumber offset; + IndexTuple downlink = NULL; - /* walk up */ - while (true) + maxoff = PageGetMaxOffsetNumber(page); + for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) { - /* - * After this call: 1. if child page was splited, then itup contains - * keys for each page 2. 
if child page wasn't splited, then itup
-		 * contains additional for adjustment of current key
-		 */
+		IndexTuple	ituple = (IndexTuple)
+			PageGetItem(page, PageGetItemId(page, offset));
 
-		if (state->stack->parent)
+		if (downlink == NULL)
+			downlink = CopyIndexTuple(ituple);
+		else
 		{
-			/*
-			 * X-lock parent page before proceed child, gistFindCorrectParent
-			 * should find and lock it
-			 */
-			gistFindCorrectParent(state->r, state->stack);
+			IndexTuple	newdownlink;
+
+			newdownlink = gistgetadjusted(rel, downlink, ituple,
+										  giststate);
+			if (newdownlink)
+				downlink = newdownlink;
 		}
-		is_splitted = gistplacetopage(state, giststate);
+	}
+
+	/*
+	 * If the page is completely empty, we can't form a meaningful downlink
+	 * for it. But we have to insert a downlink for the page. Any key will
+	 * do, as long as it's consistent with the downlink of the parent page,
+	 * so that we can legally insert it to the parent. A minimal one that
+	 * matches as few scans as possible would be best, to keep scans from
+	 * doing useless work, but we don't know how to construct that. So we
+	 * just use the downlink of the original page that was split - that's as
+	 * far from optimal as it can get but will do.
+	 */
+	if (!downlink)
+	{
+		ItemId		iid;
 
-		/* parent locked above, so release child buffer */
-		UnlockReleaseBuffer(state->stack->buffer);
+		LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+		gistFindCorrectParent(rel, stack);
+		iid = PageGetItemId(stack->parent->page, stack->parent->childoffnum);
+		downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
+		downlink = CopyIndexTuple(downlink);
+		LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+	}
 
-		/* pop parent page from stack */
-		state->stack = state->stack->parent;
+	ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
+	GistTupleSetValid(downlink);
 
-		/* stack is void */
-		if (!state->stack)
-			break;
+	return downlink;
+}
 
-		/*
-		 * child did not split, so we can check is it needed to update parent
-		 * tuple
-		 */
-		if (!is_splitted)
-		{
-			/* parent's tuple */
-			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
-			oldtup = (IndexTuple) PageGetItem(state->stack->page, iid);
-			newtup = gistgetadjusted(state->r, oldtup, state->itup[0], giststate);
-
-			if (!newtup)
-			{					/* not need to update key */
-				LockBuffer(state->stack->buffer, GIST_UNLOCK);
-				break;
-			}
-			state->itup[0] = newtup;
+/*
+ * Complete the incomplete split of state->stack->page.
+ */
+static void
+gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
+{
+	GISTInsertStack *stack = state->stack;
+	Buffer		buf;
+	Page		page;
+	List	   *splitinfo = NIL;
+
+	elog(LOG, "fixing incomplete split in index \"%s\", block %u",
+		 RelationGetRelationName(state->r), stack->blkno);
+
+	Assert(GistFollowRight(stack->page));
+	Assert(OffsetNumberIsValid(stack->parent->childoffnum));
+
+	buf = stack->buffer;
+
+	/*
+	 * Read the chain of split pages, following the rightlinks. Construct a
+	 * downlink tuple for each page.
+ */ + for (;;) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + IndexTuple downlink; + + page = BufferGetPage(buf); + + /* Form the new downlink tuples to insert to parent */ + downlink = gistformdownlink(state->r, buf, giststate, stack); + + si->buf = buf; + si->downlink = downlink; + + splitinfo = lappend(splitinfo, si); + + if (GistFollowRight(page)) + { + /* lock next page */ + buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink); + LockBuffer(buf, GIST_EXCLUSIVE); } - } /* while */ + else + break; + } + + /* Insert the downlinks */ + gistfinishsplit(state, stack, giststate, splitinfo); +} + +/* + * Insert tuples to stack->buffer. If 'oldoffnum' is valid, the new tuples + * replace an old tuple at oldoffnum. The caller must hold an exclusive lock + * on the page. + * + * If leftchild is valid, we're inserting/updating the downlink for the + * page to the right of leftchild. We clear the F_FOLLOW_RIGHT flag and + * update NSN on leftchild, atomically with the insertion of the downlink. + * + * Returns 'true' if the page had to be split. On return, we will continue + * to hold an exclusive lock on state->stack->buffer, but if we had to split + * the page, it might not contain the tuple we just inserted/updated. + */ +static bool +gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild) +{ + List *splitinfo; + bool is_split; + + is_split = gistplacetopage(state, giststate, stack->buffer, + tuples, ntup, oldoffnum, + leftchild, + &splitinfo); + if (splitinfo) + gistfinishsplit(state, stack, giststate, splitinfo); + + return is_split; +} + +/* + * Finish an incomplete split by inserting/updating the downlinks in + * parent page. 'splitinfo' contains all the child pages, exclusively-locked, + * involved in the split, from left-to-right. + */ +static void +gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo) +{ + ListCell *lc; + List *reversed; + GISTPageSplitInfo *right; + GISTPageSplitInfo *left; + IndexTuple tuples[2]; + + /* A split always contains at least two halves */ + Assert(list_length(splitinfo) >= 2); + + /* + * We need to insert downlinks for each new page, and update the downlink + * for the original (leftmost) page in the split. Begin at the rightmost + * page, inserting one downlink at a time until there's only two pages + * left. Finally insert the downlink for the last new page and update the + * downlink for the original page as one operation. + */ + + /* for convenience, create a copy of the list in reverse order */ + reversed = NIL; + foreach(lc, splitinfo) + { + reversed = lcons(lfirst(lc), reversed); + } - /* release all parent buffers */ - while (state->stack) + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(state->r, stack); + + while (list_length(reversed) > 2) { - ReleaseBuffer(state->stack->buffer); - state->stack = state->stack->parent; + right = (GISTPageSplitInfo *) linitial(reversed); + left = (GISTPageSplitInfo *) lsecond(reversed); + + if (gistinserttuples(state, stack->parent, giststate, + &right->downlink, 1, + InvalidOffsetNumber, + left->buf)) + { + /* + * If the parent page was split, need to relocate the original + * parent pointer. 
+ */ + gistFindCorrectParent(state->r, stack); + } + UnlockReleaseBuffer(right->buf); + reversed = list_delete_first(reversed); } - /* say to xlog that insert is completed */ - if (state->needInsertComplete && !state->r->rd_istemp) - gistxlogInsertCompletion(state->r->rd_node, &(state->key), 1); + right = (GISTPageSplitInfo *) linitial(reversed); + left = (GISTPageSplitInfo *) lsecond(reversed); + + /* + * Finally insert downlink for the remaining right page and update the + * downlink for the original page to not contain the tuples that were + * moved to the new pages. + */ + tuples[0] = left->downlink; + tuples[1] = right->downlink; + gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->parent->childoffnum, + left->buf); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + UnlockReleaseBuffer(right->buf); + Assert(left->buf == stack->buffer); } /* @@ -965,8 +1323,7 @@ gistSplit(Relation r, ROTATEDIST(res); res->block.num = v.splitVector.spl_nright; res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist)); - res->itup = (v.spl_rightvalid) ? gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false) - : gist_form_invalid_tuple(GIST_ROOT_BLKNO); + res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false); } if (!gistfitpage(lvectup, v.splitVector.spl_nleft)) @@ -988,51 +1345,16 @@ gistSplit(Relation r, ROTATEDIST(res); res->block.num = v.splitVector.spl_nleft; res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist)); - res->itup = (v.spl_leftvalid) ? gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false) - : gist_form_invalid_tuple(GIST_ROOT_BLKNO); + res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false); } return res; } /* - * buffer must be pinned and locked by caller + * Fill a GISTSTATE with information about the index */ void -gistnewroot(Relation r, Buffer buffer, IndexTuple *itup, int len, ItemPointer key) -{ - Page page; - - Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); - page = BufferGetPage(buffer); - - START_CRIT_SECTION(); - - GISTInitBuffer(buffer, 0); - gistfillbuffer(page, itup, len, FirstOffsetNumber); - - MarkBufferDirty(buffer); - - if (!r->rd_istemp) - { - XLogRecPtr recptr; - XLogRecData *rdata; - - rdata = formUpdateRdata(r->rd_node, buffer, - NULL, 0, - itup, len, key); - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_NEW_ROOT, rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); - } - else - PageSetLSN(page, XLogRecPtrForTemp); - - END_CRIT_SECTION(); -} - -void initGISTstate(GISTSTATE *giststate, Relation index) { int i; @@ -1066,6 +1388,29 @@ initGISTstate(GISTSTATE *giststate, Relation index) fmgr_info_copy(&(giststate->equalFn[i]), index_getprocinfo(index, i + 1, GIST_EQUAL_PROC), CurrentMemoryContext); + /* opclasses are not required to provide a Distance method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC))) + fmgr_info_copy(&(giststate->distanceFn[i]), + index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC), + CurrentMemoryContext); + else + giststate->distanceFn[i].fn_oid = InvalidOid; + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type. If there's no index + * collation then specify default collation in case the support + * functions need collation. 
This is harmless if the support + * functions don't care about collation, so we just do it + * unconditionally. (We could alternatively call get_typcollation, + * but that seems like expensive overkill --- there aren't going to be + * any cases where a GiST storage type has a nondefault collation.) + */ + if (OidIsValid(index->rd_indcollation[i])) + giststate->supportCollation[i] = index->rd_indcollation[i]; + else + giststate->supportCollation[i] = DEFAULT_COLLATION_OID; } } diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 216910307a..1aba686844 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -4,11 +4,11 @@ * fetch tuples from a GiST scan. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.85 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistget.c * *------------------------------------------------------------------------- */ @@ -20,504 +20,568 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" #include "utils/memutils.h" -static OffsetNumber gistfindnext(IndexScanDesc scan, OffsetNumber n); -static int64 gistnext(IndexScanDesc scan, TIDBitmap *tbm); -static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan, - OffsetNumber offset); - -static void -killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr) +/* + * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? + * + * The index tuple might represent either a heap tuple or a lower index page, + * depending on whether the containing page is a leaf page or not. + * + * On success return for a heap tuple, *recheck_p is set to indicate + * whether recheck is needed. We recheck if any of the consistent() functions + * request it. recheck is not interesting when examining a non-leaf entry, + * since we must visit the lower index page if there's any doubt. + * + * If we are doing an ordered scan, so->distances[] is filled with distance + * data from the distance() functions before returning success. + * + * We must decompress the key in the IndexTuple before passing it to the + * sk_funcs (which actually are the opclass Consistent or Distance methods). + * + * Note that this function is always invoked in a short-lived memory context, + * so we don't need to worry about cleaning up allocated memory, either here + * or in the implementation of any Consistent or Distance methods. + */ +static bool +gistindex_keytest(IndexScanDesc scan, + IndexTuple tuple, + Page page, + OffsetNumber offset, + bool *recheck_p) { - Page p; - OffsetNumber offset; + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + ScanKey key = scan->keyData; + int keySize = scan->numberOfKeys; + double *distance_p; + Relation r = scan->indexRelation; - LockBuffer(so->curbuf, GIST_SHARE); - gistcheckpage(r, so->curbuf); - p = (Page) BufferGetPage(so->curbuf); + *recheck_p = false; - if (XLByteEQ(so->stack->lsn, PageGetLSN(p))) + /* + * If it's a leftover invalid tuple from pre-9.1, treat it as a match with + * minimum possible distances. This means we'll always follow it to the + * referenced page. 
+ */ + if (GistTupleIsInvalid(tuple)) { - /* page unchanged, so all is simple */ - offset = ItemPointerGetOffsetNumber(iptr); - ItemIdMarkDead(PageGetItemId(p, offset)); - SetBufferCommitInfoNeedsSave(so->curbuf); + int i; + + if (GistPageIsLeaf(page)) /* shouldn't happen */ + elog(ERROR, "invalid GiST tuple found on leaf page"); + for (i = 0; i < scan->numberOfOrderBys; i++) + so->distances[i] = -get_float8_infinity(); + return true; } - else + + /* Check whether it matches according to the Consistent functions */ + while (keySize > 0) { - OffsetNumber maxoff = PageGetMaxOffsetNumber(p); + Datum datum; + bool isNull; - for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) - { - IndexTuple ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset)); + datum = index_getattr(tuple, + key->sk_attno, + giststate->tupdesc, + &isNull); - if (ItemPointerEquals(&(ituple->t_tid), iptr)) + if (key->sk_flags & SK_ISNULL) + { + /* + * On non-leaf page we can't conclude that child hasn't NULL + * values because of assumption in GiST: union (VAL, NULL) is VAL. + * But if on non-leaf page key IS NULL, then all children are + * NULL. + */ + if (key->sk_flags & SK_SEARCHNULL) { - /* found */ - ItemIdMarkDead(PageGetItemId(p, offset)); - SetBufferCommitInfoNeedsSave(so->curbuf); - break; + if (GistPageIsLeaf(page) && !isNull) + return false; + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (isNull) + return false; } } - } + else if (isNull) + { + return false; + } + else + { + Datum test; + bool recheck; + GISTENTRY de; - LockBuffer(so->curbuf, GIST_UNLOCK); -} + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + FALSE, isNull); -/* - * gistgettuple() -- Get the next tuple in the scan - */ -Datum -gistgettuple(PG_FUNCTION_ARGS) -{ - IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); - GISTScanOpaque so; - bool res; + /* + * Call the Consistent function to evaluate the test. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the comparison operator's strategy number and subtype + * from pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * We initialize the recheck flag to true (the safest assumption) + * in case the Consistent function forgets to set it. + */ + recheck = true; - so = (GISTScanOpaque) scan->opaque; + test = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int32GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); - if (dir != ForwardScanDirection) - elog(ERROR, "GiST doesn't support other scan directions than forward"); + if (!DatumGetBool(test)) + return false; + *recheck_p |= recheck; + } - /* - * If we have produced an index tuple in the past and the executor has - * informed us we need to mark it as "killed", do so now. - */ - if (scan->kill_prior_tuple && ItemPointerIsValid(&(so->curpos))) - killtuple(scan->indexRelation, so, &(so->curpos)); + key++; + keySize--; + } - /* - * Get the next tuple that matches the search key. 
- */ - res = (gistnext(scan, NULL) > 0); + /* OK, it passes --- now let's compute the distances */ + key = scan->orderByData; + distance_p = so->distances; + keySize = scan->numberOfOrderBys; + while (keySize > 0) + { + Datum datum; + bool isNull; - PG_RETURN_BOOL(res); -} + datum = index_getattr(tuple, + key->sk_attno, + giststate->tupdesc, + &isNull); -Datum -gistgetbitmap(PG_FUNCTION_ARGS) -{ - IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); - int64 ntids; + if ((key->sk_flags & SK_ISNULL) || isNull) + { + /* Assume distance computes as null and sorts to the end */ + *distance_p = get_float8_infinity(); + } + else + { + Datum dist; + GISTENTRY de; - ntids = gistnext(scan, tbm); + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + FALSE, isNull); - PG_RETURN_INT64(ntids); + /* + * Call the Distance function to evaluate the distance. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, and the ordering operator's strategy number and subtype + * from pg_amop. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * Note that Distance functions don't get a recheck argument. We + * can't tolerate lossy distance calculations on leaf tuples; + * there is no opportunity to re-sort the tuples afterwards. + */ + dist = FunctionCall4Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int32GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype)); + + *distance_p = DatumGetFloat8(dist); + } + + key++; + distance_p++; + keySize--; + } + + return true; } /* - * Fetch tuple(s) that match the search key; this can be invoked - * either to fetch the first such tuple or subsequent matching tuples. + * Scan all items on the GiST index page identified by *pageItem, and insert + * them into the queue (or directly to output areas) + * + * scan: index scan we are executing + * pageItem: search queue item identifying an index page to scan + * myDistances: distances array associated with pageItem, or NULL at the root + * tbm: if not NULL, gistgetbitmap's output bitmap + * ntids: if not NULL, gistgetbitmap's output tuple counter * - * This function is used by both gistgettuple and gistgetbitmap. When - * invoked from gistgettuple, tbm is null and the next matching tuple - * is returned in scan->xs_ctup.t_self. When invoked from getbitmap, - * tbm is non-null and all matching tuples are added to tbm before - * returning. In both cases, the function result is the number of - * returned tuples. + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap + * tuples should be reported directly into the bitmap. If they are NULL, + * we're doing a plain or ordered indexscan. For a plain indexscan, heap + * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, + * heap tuple TIDs are pushed into individual search queue items. * - * If scan specifies to skip killed tuples, continue looping until we find a - * non-killed tuple that matches the search key. + * If we detect that the index page has split since we saw its downlink + * in the parent, we push its new right sibling onto the queue so the + * sibling will be processed next. 
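+ * The sibling is enqueued with the same distances as this page, since those
+ * distances were computed from the parent downlink, which covered the
+ * sibling's tuples too before the split.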
*/
-static int64
-gistnext(IndexScanDesc scan, TIDBitmap *tbm)
+static void
+gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
+			 TIDBitmap *tbm, int64 *ntids)
 {
-	Page		p;
-	OffsetNumber n;
-	GISTScanOpaque so;
-	GISTSearchStack *stk;
-	IndexTuple	it;
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	Buffer		buffer;
+	Page		page;
 	GISTPageOpaque opaque;
-	int64		ntids = 0;
+	OffsetNumber maxoff;
+	OffsetNumber i;
+	GISTSearchTreeItem *tmpItem = so->tmpTreeItem;
+	bool		isNew;
+	MemoryContext oldcxt;

-	so = (GISTScanOpaque) scan->opaque;
+	Assert(!GISTSearchItemIsHeap(*pageItem));

-	if (so->qual_ok == false)
-		return 0;
+	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
+	LockBuffer(buffer, GIST_SHARE);
+	gistcheckpage(scan->indexRelation, buffer);
+	page = BufferGetPage(buffer);
+	opaque = GistPageGetOpaque(page);

-	if (so->curbuf == InvalidBuffer)
+	/*
+	 * Check if we need to follow the rightlink. We need to follow it if the
+	 * page was concurrently split since we visited the parent (in which case
+	 * parentlsn < nsn), or if the system crashed after a page split but
+	 * before the downlink was inserted into the parent.
+	 */
+	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
+		(GistFollowRight(page) ||
+		 XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
+		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
 	{
-		if (ItemPointerIsValid(&so->curpos) == false)
-		{
-			/* Being asked to fetch the first entry, so start at the root */
-			Assert(so->curbuf == InvalidBuffer);
-			Assert(so->stack == NULL);
+		/* There was a page split, follow right link to add pages */
+		GISTSearchItem *item;

-			so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);
+		/* This can't happen when starting at the root */
+		Assert(myDistances != NULL);

-			stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));
+		oldcxt = MemoryContextSwitchTo(so->queueCxt);

-			stk->next = NULL;
-			stk->block = GIST_ROOT_BLKNO;
+		/* Create new GISTSearchItem for the right sibling index page */
+		item = palloc(sizeof(GISTSearchItem));
+		item->next = NULL;
+		item->blkno = opaque->rightlink;
+		item->data.parentlsn = pageItem->data.parentlsn;

-			pgstat_count_index_scan(scan->indexRelation);
-		}
-		else
-		{
-			/* scan is finished */
-			return 0;
-		}
+		/* Insert it into the queue using same distances as for this page */
+		tmpItem->head = item;
+		tmpItem->lastHeap = NULL;
+		memcpy(tmpItem->distances, myDistances,
+			   sizeof(double) * scan->numberOfOrderBys);
+
+		(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
+
+		MemoryContextSwitchTo(oldcxt);
 	}

+	so->nPageData = so->curPageData = 0;
+
 	/*
-	 * check stored pointers from last visit
+	 * check all tuples on page
 	 */
-	if (so->nPageData > 0)
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
+		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+		bool		match;
+		bool		recheck;
+
 		/*
-		 * gistgetmulti never should go here
+		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
+		 * junk afterward.
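+		 * The user-supplied Consistent and Distance functions run in that
+		 * context and may leak memory, so resetting it once per tuple keeps
+		 * the scan's memory footprint bounded.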
*/ - Assert(tbm == NULL); + oldcxt = MemoryContextSwitchTo(so->tempCxt); - if (so->curPageData < so->nPageData) - { - scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; + match = gistindex_keytest(scan, it, page, i, &recheck); - ItemPointerSet(&so->curpos, - BufferGetBlockNumber(so->curbuf), - so->pageData[so->curPageData].pageOffset); + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(so->tempCxt); - so->curPageData++; + /* Ignore tuple if it doesn't match */ + if (!match) + continue; - return 1; + if (tbm && GistPageIsLeaf(page)) + { + /* + * getbitmap scan, so just push heap tuple TIDs into the bitmap + * without worrying about ordering + */ + tbm_add_tuples(tbm, &it->t_tid, 1, recheck); + (*ntids)++; + } + else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + { + /* + * Non-ordered scan, so report heap tuples in so->pageData[] + */ + so->pageData[so->nPageData].heapPtr = it->t_tid; + so->pageData[so->nPageData].recheck = recheck; + so->nPageData++; } else { /* - * Go to the next page + * Must push item into search queue. We get here for any lower + * index page, and also for heap tuples if doing an ordered + * search. */ - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; + GISTSearchItem *item; - /* If we're out of stack entries, we're done */ - if (so->stack == NULL) + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for this item */ + item = palloc(sizeof(GISTSearchItem)); + item->next = NULL; + + if (GistPageIsLeaf(page)) + { + /* Creating heap-tuple GISTSearchItem */ + item->blkno = InvalidBlockNumber; + item->data.heap.heapPtr = it->t_tid; + item->data.heap.recheck = recheck; + } + else { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return 0; + /* Creating index-page GISTSearchItem */ + item->blkno = ItemPointerGetBlockNumber(&it->t_tid); + /* lsn of current page is lsn of parent page for child */ + item->data.parentlsn = PageGetLSN(page); } - so->curbuf = ReleaseAndReadBuffer(so->curbuf, - scan->indexRelation, - stk->block); + /* Insert it into the queue using new distance data */ + tmpItem->head = item; + tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL; + memcpy(tmpItem->distances, so->distances, + sizeof(double) * scan->numberOfOrderBys); + + (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew); + + MemoryContextSwitchTo(oldcxt); } } + UnlockReleaseBuffer(buffer); +} + +/* + * Extract next item (in order) from search queue + * + * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it. + * + * NOTE: on successful return, so->curTreeItem is the GISTSearchTreeItem that + * contained the result item. Callers can use so->curTreeItem->distances as + * the distances value for the item. 
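+ *
+ * A typical caller drains the queue in a loop (compare getNextNearest and
+ * gistgetbitmap below):
+ *
+ *		while ((item = getNextGISTSearchItem(so)) != NULL)
+ *		{
+ *			if (GISTSearchItemIsHeap(*item))
+ *				... return or emit the heap tuple ...
+ *			else
+ *				gistScanPage(scan, item, so->curTreeItem->distances, ...);
+ *			pfree(item);
+ *		}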
+ */ +static GISTSearchItem * +getNextGISTSearchItem(GISTScanOpaque so) +{ for (;;) { - CHECK_FOR_INTERRUPTS(); + GISTSearchItem *item; - /* First of all, we need lock buffer */ - Assert(so->curbuf != InvalidBuffer); - LockBuffer(so->curbuf, GIST_SHARE); - gistcheckpage(scan->indexRelation, so->curbuf); - p = BufferGetPage(so->curbuf); - opaque = GistPageGetOpaque(p); - - /* remember lsn to identify page changed for tuple's killing */ - so->stack->lsn = PageGetLSN(p); - - /* check page split, occured since visit to parent */ - if (!XLogRecPtrIsInvalid(so->stack->parentlsn) && - XLByteLT(so->stack->parentlsn, opaque->nsn) && - opaque->rightlink != InvalidBlockNumber /* sanity check */ && - (so->stack->next == NULL || so->stack->next->block != opaque->rightlink) /* check if already - added */ ) + /* Update curTreeItem if we don't have one */ + if (so->curTreeItem == NULL) { - /* detect page split, follow right link to add pages */ - - stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); - stk->next = so->stack->next; - stk->block = opaque->rightlink; - stk->parentlsn = so->stack->parentlsn; - memset(&(stk->lsn), 0, sizeof(GistNSN)); - so->stack->next = stk; + so->curTreeItem = (GISTSearchTreeItem *) rb_leftmost(so->queue); + /* Done when tree is empty */ + if (so->curTreeItem == NULL) + break; } - /* if page is empty, then just skip it */ - if (PageIsEmpty(p)) + item = so->curTreeItem->head; + if (item != NULL) { - LockBuffer(so->curbuf, GIST_UNLOCK); - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; - - if (so->stack == NULL) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return ntids; - } - - so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation, - stk->block); - continue; + /* Delink item from chain */ + so->curTreeItem->head = item->next; + if (item == so->curTreeItem->lastHeap) + so->curTreeItem->lastHeap = NULL; + /* Return item; caller is responsible to pfree it */ + return item; } - n = FirstOffsetNumber; - - /* wonderful, we can look at page */ - so->nPageData = so->curPageData = 0; - - for (;;) - { - n = gistfindnext(scan, n); - - if (!OffsetNumberIsValid(n)) - { - /* - * If we was called from gistgettuple and current buffer - * contains something matched then make a recursive call - it - * will return ItemPointer from so->pageData. But we save - * buffer pinned to support tuple's killing - */ - if (!tbm && so->nPageData > 0) - { - LockBuffer(so->curbuf, GIST_UNLOCK); - return gistnext(scan, NULL); - } + /* curTreeItem is exhausted, so remove it from rbtree */ + rb_delete(so->queue, (RBNode *) so->curTreeItem); + so->curTreeItem = NULL; + } - /* - * We ran out of matching index entries on the current page, - * so pop the top stack entry and use it to continue the - * search. - */ - LockBuffer(so->curbuf, GIST_UNLOCK); - stk = so->stack->next; - pfree(so->stack); - so->stack = stk; - - /* If we're out of stack entries, we're done */ - - if (so->stack == NULL) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - return ntids; - } - - so->curbuf = ReleaseAndReadBuffer(so->curbuf, - scan->indexRelation, - stk->block); - /* XXX go up */ - break; - } + return NULL; +} - if (GistPageIsLeaf(p)) - { - /* - * We've found a matching index entry in a leaf page, so - * return success. Note that we keep "curbuf" pinned so that - * we can efficiently resume the index scan later. 
- */ +/* + * Fetch next heap tuple in an ordered search + */ +static bool +getNextNearest(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool res = false; - if (!(scan->ignore_killed_tuples && - ItemIdIsDead(PageGetItemId(p, n)))) - { - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - ntids++; - if (tbm != NULL) - tbm_add_tuples(tbm, &it->t_tid, 1, scan->xs_recheck); - else - { - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].pageOffset = n; - so->pageData[so->nPageData].recheck = scan->xs_recheck; - so->nPageData++; - } - } - } - else - { - /* - * We've found an entry in an internal node whose key is - * consistent with the search key, so push it to stack - */ - stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); + do + { + GISTSearchItem *item = getNextGISTSearchItem(so); - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - stk->block = ItemPointerGetBlockNumber(&(it->t_tid)); - memset(&(stk->lsn), 0, sizeof(GistNSN)); - stk->parentlsn = so->stack->lsn; + if (!item) + break; - stk->next = so->stack->next; - so->stack->next = stk; - } + if (GISTSearchItemIsHeap(*item)) + { + /* found a heap item at currently minimal distance */ + scan->xs_ctup.t_self = item->data.heap.heapPtr; + scan->xs_recheck = item->data.heap.recheck; + res = true; + } + else + { + /* visit an index page, extract its items into queue */ + CHECK_FOR_INTERRUPTS(); - n = OffsetNumberNext(n); + gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL); } - } - return ntids; + pfree(item); + } while (!res); + + return res; } /* - * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? - * - * On success return for a leaf tuple, scan->xs_recheck is set to indicate - * whether recheck is needed. We recheck if any of the consistent() functions - * request it. - * - * We must decompress the key in the IndexTuple before passing it to the - * sk_func (and we have previously overwritten the sk_func to use the - * user-defined Consistent method, so we actually are invoking that). - * - * Note that this function is always invoked in a short-lived memory context, - * so we don't need to worry about cleaning up allocated memory, either here - * or in the implementation of any Consistent methods. 
+ * gistgettuple() -- Get the next tuple in the scan */ -static bool -gistindex_keytest(IndexTuple tuple, - IndexScanDesc scan, - OffsetNumber offset) +Datum +gistgettuple(PG_FUNCTION_ARGS) { - int keySize = scan->numberOfKeys; - ScanKey key = scan->keyData; - Relation r = scan->indexRelation; - GISTScanOpaque so; - Page p; - GISTSTATE *giststate; - - so = (GISTScanOpaque) scan->opaque; - giststate = so->giststate; - p = BufferGetPage(so->curbuf); + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - scan->xs_recheck = false; + if (dir != ForwardScanDirection) + elog(ERROR, "GiST only supports forward scan direction"); - /* - * Tuple doesn't restore after crash recovery because of incomplete insert - */ - if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple)) - return true; + if (!so->qual_ok) + PG_RETURN_BOOL(false); - while (keySize > 0) + if (so->firstCall) { - Datum datum; - bool isNull; - Datum test; - bool recheck; - GISTENTRY de; + /* Begin the scan by processing the root page */ + GISTSearchItem fakeItem; - datum = index_getattr(tuple, - key->sk_attno, - giststate->tupdesc, - &isNull); + pgstat_count_index_scan(scan->indexRelation); - if (key->sk_flags & SK_ISNULL) + so->firstCall = false; + so->curTreeItem = NULL; + so->curPageData = so->nPageData = 0; + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, NULL, NULL); + } + + if (scan->numberOfOrderBys > 0) + { + /* Must fetch tuples in strict distance order */ + PG_RETURN_BOOL(getNextNearest(scan)); + } + else + { + /* Fetch tuples index-page-at-a-time */ + for (;;) { - /* - * On non-leaf page we can't conclude that child hasn't NULL - * values because of assumption in GiST: union (VAL, NULL) is VAL. - * But if on non-leaf page key IS NULL, then all children are - * NULL. - */ - if (key->sk_flags & SK_SEARCHNULL) + if (so->curPageData < so->nPageData) { - if (GistPageIsLeaf(p) && !isNull) - return false; + /* continuing to return tuples from a leaf page */ + scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr; + scan->xs_recheck = so->pageData[so->curPageData].recheck; + so->curPageData++; + PG_RETURN_BOOL(true); } - else + + /* find and process the next index page */ + do { - Assert(key->sk_flags & SK_SEARCHNOTNULL); - if (isNull) - return false; - } - } - else if (isNull) - { - return false; - } - else - { - gistdentryinit(giststate, key->sk_attno - 1, &de, - datum, r, p, offset, - FALSE, isNull); + GISTSearchItem *item = getNextGISTSearchItem(so); - /* - * Call the Consistent function to evaluate the test. The - * arguments are the index datum (as a GISTENTRY*), the comparison - * datum, the comparison operator's strategy number and subtype - * from pg_amop, and the recheck flag. - * - * (Presently there's no need to pass the subtype since it'll - * always be zero, but might as well pass it for possible future - * use.) - * - * We initialize the recheck flag to true (the safest assumption) - * in case the Consistent function forgets to set it. 
- */ - recheck = true; + if (!item) + PG_RETURN_BOOL(false); - test = FunctionCall5(&key->sk_func, - PointerGetDatum(&de), - key->sk_argument, - Int32GetDatum(key->sk_strategy), - ObjectIdGetDatum(key->sk_subtype), - PointerGetDatum(&recheck)); + CHECK_FOR_INTERRUPTS(); - if (!DatumGetBool(test)) - return false; - scan->xs_recheck |= recheck; - } + /* + * While scanning a leaf page, ItemPointers of matching heap + * tuples are stored in so->pageData. If there are any on + * this page, we fall out of the inner "do" and loop around to + * return them. + */ + gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL); - keySize--; - key++; + pfree(item); + } while (so->nPageData == 0); + } } - return true; + PG_RETURN_BOOL(false); /* keep compiler quiet */ } /* - * Return the offset of the first index entry that is consistent with - * the search key after offset 'n' in the current page. If there are - * no more consistent entries, return InvalidOffsetNumber. - * On success, scan->xs_recheck is set correctly, too. - * Page should be locked.... + * gistgetbitmap() -- Get a bitmap of all heap tuple locations */ -static OffsetNumber -gistfindnext(IndexScanDesc scan, OffsetNumber n) +Datum +gistgetbitmap(PG_FUNCTION_ARGS) { - OffsetNumber maxoff; - IndexTuple it; - GISTScanOpaque so; - MemoryContext oldcxt; - Page p; + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + int64 ntids = 0; + GISTSearchItem fakeItem; - so = (GISTScanOpaque) scan->opaque; - p = BufferGetPage(so->curbuf); - maxoff = PageGetMaxOffsetNumber(p); + if (!so->qual_ok) + PG_RETURN_INT64(0); + + pgstat_count_index_scan(scan->indexRelation); + + /* Begin the scan by processing the root page */ + so->curTreeItem = NULL; + so->curPageData = so->nPageData = 0; + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); /* - * Make sure we're in a short-lived memory context when we invoke a - * user-supplied GiST method in gistindex_keytest(), so we don't leak - * memory + * While scanning a leaf page, ItemPointers of matching heap tuples will + * be stored directly into tbm, so we don't need to deal with them here. */ - oldcxt = MemoryContextSwitchTo(so->tempCxt); - - while (n >= FirstOffsetNumber && n <= maxoff) + for (;;) { - it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); - if (gistindex_keytest(it, scan, n)) + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) break; - n = OffsetNumberNext(n); - } + CHECK_FOR_INTERRUPTS(); - MemoryContextSwitchTo(oldcxt); - MemoryContextReset(so->tempCxt); + gistScanPage(scan, item, so->curTreeItem->distances, tbm, &ntids); - /* - * If we found a matching entry, return its offset; otherwise return - * InvalidOffsetNumber to inform the caller to go to the next page. - */ - if (n >= FirstOffsetNumber && n <= maxoff) - return n; - else - return InvalidOffsetNumber; + pfree(item); + } + + PG_RETURN_INT64(ntids); } diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index cb34b26113..43c4b1251b 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -6,11 +6,11 @@ * This gives R-tree behavior, with Guttman's poly-time split algorithm. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.21 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistproc.c * *------------------------------------------------------------------------- */ @@ -380,12 +380,12 @@ gist_box_picksplit(PG_FUNCTION_ARGS) for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i)) { cur = DatumGetBoxP(entryvec->vector[i].key); - if (allisequal == true && ( - pageunion.high.x != cur->high.x || - pageunion.high.y != cur->high.y || - pageunion.low.x != cur->low.x || - pageunion.low.y != cur->low.y - )) + if (allisequal && ( + pageunion.high.x != cur->high.x || + pageunion.high.y != cur->high.y || + pageunion.low.x != cur->low.x || + pageunion.low.y != cur->low.y + )) allisequal = false; adjustBox(&pageunion, cur); @@ -904,6 +904,76 @@ gist_point_compress(PG_FUNCTION_ARGS) PG_RETURN_POINTER(entry); } +#define point_point_distance(p1,p2) \ + DatumGetFloat8(DirectFunctionCall2(point_distance, \ + PointPGetDatum(p1), PointPGetDatum(p2))) + +static double +computeDistance(bool isLeaf, BOX *box, Point *point) +{ + double result = 0.0; + + if (isLeaf) + { + /* simple point to point distance */ + result = point_point_distance(point, &box->low); + } + else if (point->x <= box->high.x && point->x >= box->low.x && + point->y <= box->high.y && point->y >= box->low.y) + { + /* point inside the box */ + result = 0.0; + } + else if (point->x <= box->high.x && point->x >= box->low.x) + { + /* point is over or below box */ + Assert(box->low.y <= box->high.y); + if (point->y > box->high.y) + result = point->y - box->high.y; + else if (point->y < box->low.y) + result = box->low.y - point->y; + else + elog(ERROR, "inconsistent point values"); + } + else if (point->y <= box->high.y && point->y >= box->low.y) + { + /* point is to left or right of box */ + Assert(box->low.x <= box->high.x); + if (point->x > box->high.x) + result = point->x - box->high.x; + else if (point->x < box->low.x) + result = box->low.x - point->x; + else + elog(ERROR, "inconsistent point values"); + } + else + { + /* closest point will be a vertex */ + Point p; + double subresult; + + result = point_point_distance(point, &box->low); + + subresult = point_point_distance(point, &box->high); + if (result > subresult) + result = subresult; + + p.x = box->low.x; + p.y = box->high.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + + p.x = box->high.x; + p.y = box->low.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + } + + return result; +} + static bool gist_point_consistent_internal(StrategyNumber strategy, bool isLeaf, BOX *key, Point *query) @@ -954,8 +1024,8 @@ gist_point_consistent(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); - bool result; bool *recheck = (bool *) PG_GETARG_POINTER(4); + bool result; StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; switch (strategyGroup) @@ -1034,9 +1104,32 @@ gist_point_consistent(PG_FUNCTION_ARGS) } break; default: - result = false; /* silence compiler warning */ elog(ERROR, "unknown strategy number: %d", strategy); + result = false; /* keep compiler quiet */ } 
PG_RETURN_BOOL(result); } + +Datum +gist_point_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + double distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + break; + default: + elog(ERROR, "unknown strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + } + + PG_RETURN_FLOAT8(distance); +} diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index a53d8cd087..5662a3a4aa 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -4,11 +4,11 @@ * routines to manage scans on GiST index relations * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.79 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/gist/gistscan.c * *------------------------------------------------------------------------- */ @@ -20,18 +20,112 @@ #include "access/relscan.h" #include "storage/bufmgr.h" #include "utils/memutils.h" +#include "utils/rel.h" -static void gistfreestack(GISTSearchStack *s); + +/* + * RBTree support functions for the GISTSearchTreeItem queue + */ + +static int +GISTSearchTreeItemComparator(const RBNode *a, const RBNode *b, void *arg) +{ + const GISTSearchTreeItem *sa = (const GISTSearchTreeItem *) a; + const GISTSearchTreeItem *sb = (const GISTSearchTreeItem *) b; + IndexScanDesc scan = (IndexScanDesc) arg; + int i; + + /* Order according to distance comparison */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + if (sa->distances[i] != sb->distances[i]) + return (sa->distances[i] > sb->distances[i]) ? 1 : -1; + } + + return 0; +} + +static void +GISTSearchTreeItemCombiner(RBNode *existing, const RBNode *newrb, void *arg) +{ + GISTSearchTreeItem *scurrent = (GISTSearchTreeItem *) existing; + const GISTSearchTreeItem *snew = (const GISTSearchTreeItem *) newrb; + GISTSearchItem *newitem = snew->head; + + /* snew should have just one item in its chain */ + Assert(newitem && newitem->next == NULL); + + /* + * If new item is heap tuple, it goes to front of chain; otherwise insert + * it before the first index-page item, so that index pages are visited in + * LIFO order, ensuring depth-first search of index pages. See comments + * in gist_private.h. 
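+ *
+ * For one distance value the chain thus looks like
+ *
+ *		heap (newest) ... heap (lastHeap) -> index page (newest) ... (oldest)
+ *
+ * so heap tuples are consumed first, and index pages are popped LIFO.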
+ */ + if (GISTSearchItemIsHeap(*newitem)) + { + newitem->next = scurrent->head; + scurrent->head = newitem; + if (scurrent->lastHeap == NULL) + scurrent->lastHeap = newitem; + } + else if (scurrent->lastHeap == NULL) + { + newitem->next = scurrent->head; + scurrent->head = newitem; + } + else + { + newitem->next = scurrent->lastHeap->next; + scurrent->lastHeap->next = newitem; + } +} + +static RBNode * +GISTSearchTreeItemAllocator(void *arg) +{ + IndexScanDesc scan = (IndexScanDesc) arg; + + return palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys); +} + +static void +GISTSearchTreeItemDeleter(RBNode *rb, void *arg) +{ + pfree(rb); +} + + +/* + * Index AM API functions for scanning GiST indexes + */ Datum gistbeginscan(PG_FUNCTION_ARGS) { Relation r = (Relation) PG_GETARG_POINTER(0); int nkeys = PG_GETARG_INT32(1); - ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + int norderbys = PG_GETARG_INT32(2); IndexScanDesc scan; + GISTScanOpaque so; - scan = RelationGetIndexScan(r, nkeys, key); + scan = RelationGetIndexScan(r, nkeys, norderbys); + + /* initialize opaque data */ + so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData)); + so->queueCxt = AllocSetContextCreate(CurrentMemoryContext, + "GiST queue context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + so->tempCxt = createTempGistContext(); + so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + initGISTstate(so->giststate, scan->indexRelation); + /* workspaces with size dependent on numberOfOrderBys: */ + so->tmpTreeItem = palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys); + so->distances = palloc(sizeof(double) * scan->numberOfOrderBys); + so->qual_ok = true; /* in case there are zero keys */ + + scan->opaque = so; PG_RETURN_POINTER(scan); } @@ -41,42 +135,28 @@ gistrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey key = (ScanKey) PG_GETARG_POINTER(1); - GISTScanOpaque so; + ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); + + /* nkeys and norderbys arguments are ignored */ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; int i; + MemoryContext oldCxt; - so = (GISTScanOpaque) scan->opaque; - if (so != NULL) - { - /* rescan an existing indexscan --- reset state */ - gistfreestack(so->stack); - so->stack = NULL; - /* drop pins on buffers -- no locks held */ - if (BufferIsValid(so->curbuf)) - { - ReleaseBuffer(so->curbuf); - so->curbuf = InvalidBuffer; - } - } - else - { - /* initialize opaque data */ - so = (GISTScanOpaque) palloc(sizeof(GISTScanOpaqueData)); - so->stack = NULL; - so->tempCxt = createTempGistContext(); - so->curbuf = InvalidBuffer; - so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); - initGISTstate(so->giststate, scan->indexRelation); - - scan->opaque = so; - } + /* rescan an existing indexscan --- reset state */ + MemoryContextReset(so->queueCxt); + so->curTreeItem = NULL; - /* - * Clear all the pointers. 
- */ - ItemPointerSetInvalid(&so->curpos); - so->nPageData = so->curPageData = 0; + /* create new, empty RBTree for search queue */ + oldCxt = MemoryContextSwitchTo(so->queueCxt); + so->queue = rb_create(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys, + GISTSearchTreeItemComparator, + GISTSearchTreeItemCombiner, + GISTSearchTreeItemAllocator, + GISTSearchTreeItemDeleter, + scan); + MemoryContextSwitchTo(oldCxt); - so->qual_ok = true; + so->firstCall = true; /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) @@ -85,8 +165,8 @@ gistrescan(PG_FUNCTION_ARGS) scan->numberOfKeys * sizeof(ScanKeyData)); /* - * Modify the scan key so that all the Consistent method is called for - * all comparisons. The original operator is passed to the Consistent + * Modify the scan key so that the Consistent method is called for all + * comparisons. The original operator is passed to the Consistent * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. @@ -95,9 +175,11 @@ gistrescan(PG_FUNCTION_ARGS) * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we * assume all indexable operators are strict). */ + so->qual_ok = true; + for (i = 0; i < scan->numberOfKeys; i++) { - ScanKey skey = &(scan->keyData[i]); + ScanKey skey = scan->keyData + i; skey->sk_func = so->giststate->consistentFn[skey->sk_attno - 1]; @@ -109,6 +191,33 @@ gistrescan(PG_FUNCTION_ARGS) } } + /* Update order-by key, if a new one is given */ + if (orderbys && scan->numberOfOrderBys > 0) + { + memmove(scan->orderByData, orderbys, + scan->numberOfOrderBys * sizeof(ScanKeyData)); + + /* + * Modify the order-by key so that the Distance method is called for + * all comparisons. The original operator is passed to the Distance + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. + */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = scan->orderByData + i; + + skey->sk_func = so->giststate->distanceFn[skey->sk_attno - 1]; + + /* Check we actually have a distance function ... 
*/ + if (!OidIsValid(skey->sk_func.fn_oid)) + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIST_DISTANCE_PROC, skey->sk_attno, + RelationGetRelationName(scan->indexRelation)); + } + } + PG_RETURN_VOID(); } @@ -130,33 +239,14 @@ Datum gistendscan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - GISTScanOpaque so; - - so = (GISTScanOpaque) scan->opaque; + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - if (so != NULL) - { - gistfreestack(so->stack); - if (so->giststate != NULL) - freeGISTstate(so->giststate); - /* drop pins on buffers -- we aren't holding any locks */ - if (BufferIsValid(so->curbuf)) - ReleaseBuffer(so->curbuf); - MemoryContextDelete(so->tempCxt); - pfree(scan->opaque); - } + freeGISTstate(so->giststate); + MemoryContextDelete(so->queueCxt); + MemoryContextDelete(so->tempCxt); + pfree(so->tmpTreeItem); + pfree(so->distances); + pfree(so); PG_RETURN_VOID(); } - -static void -gistfreestack(GISTSearchStack *s) -{ - while (s != NULL) - { - GISTSearchStack *p = s->next; - - pfree(s); - s = p; - } -} diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c index 5700e530fe..bd846cecca 100644 --- a/src/backend/access/gist/gistsplit.c +++ b/src/backend/access/gist/gistsplit.c @@ -4,11 +4,11 @@ * Split page algorithm * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistsplit.c,v 1.12 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistsplit.c * *------------------------------------------------------------------------- */ @@ -325,16 +325,18 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC evec->n = v->spl_nleft; memcpy(evec->vector, entryvec->vector + FirstOffsetNumber, sizeof(GISTENTRY) * evec->n); - v->spl_ldatum = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&nbytes)); + v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); evec->n = v->spl_nright; memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft, sizeof(GISTENTRY) * evec->n); - v->spl_rdatum = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&nbytes)); + v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); } /* @@ -361,9 +363,10 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec sv->spl_ldatum = v->spl_lattr[attno]; sv->spl_rdatum = v->spl_rattr[attno]; - FunctionCall2(&giststate->picksplitFn[attno], - PointerGetDatum(entryvec), - PointerGetDatum(sv)); + FunctionCall2Coll(&giststate->picksplitFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(entryvec), + PointerGetDatum(sv)); if (sv->spl_nleft == 0 || sv->spl_nright == 0) { @@ -500,58 +503,6 @@ gistSplitHalf(GIST_SPLITVEC *v, int len) } /* - * if it was invalid tuple then we need special processing. - * We move all invalid tuples on right page. - * - * if there is no place on left page, gistSplit will be called one more - * time for left page. 
- * - * Normally, we never exec this code, but after crash replay it's possible - * to get 'invalid' tuples (probability is low enough) - */ -static void -gistSplitByInvalid(GISTSTATE *giststate, GistSplitVector *v, IndexTuple *itup, int len) -{ - int i; - static OffsetNumber offInvTuples[MaxOffsetNumber]; - int nOffInvTuples = 0; - - for (i = 1; i <= len; i++) - if (GistTupleIsInvalid(itup[i - 1])) - offInvTuples[nOffInvTuples++] = i; - - if (nOffInvTuples == len) - { - /* corner case, all tuples are invalid */ - v->spl_rightvalid = v->spl_leftvalid = false; - gistSplitHalf(&v->splitVector, len); - } - else - { - GistSplitUnion gsvp; - - v->splitVector.spl_right = offInvTuples; - v->splitVector.spl_nright = nOffInvTuples; - v->spl_rightvalid = false; - - v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); - v->splitVector.spl_nleft = 0; - for (i = 1; i <= len; i++) - if (!GistTupleIsInvalid(itup[i - 1])) - v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; - v->spl_leftvalid = true; - - gsvp.equiv = NULL; - gsvp.attr = v->spl_lattr; - gsvp.len = v->splitVector.spl_nleft; - gsvp.entries = v->splitVector.spl_left; - gsvp.isnull = v->spl_lisnull; - - gistunionsubkeyvec(giststate, itup, &gsvp, 0); - } -} - -/* * trys to split page by attno key, in a case of null * values move its to separate page. */ @@ -568,12 +519,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist Datum datum; bool IsNull; - if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1])) - { - gistSplitByInvalid(giststate, v, itup, len); - return; - } - datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, attno, &(entryvec->vector[i]), datum, r, page, i, @@ -582,8 +527,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist offNullTuples[nOffNullTuples++] = i; } - v->spl_leftvalid = v->spl_rightvalid = true; - if (nOffNullTuples == len) { /* diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 03c5773d4d..1754a10369 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -4,15 +4,17 @@ * utilities routines for the postgres GiST index access method. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.35 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistutil.c *------------------------------------------------------------------------- */ #include "postgres.h" +#include <math.h> + #include "access/gist_private.h" #include "access/reloptions.h" #include "storage/freespace.h" @@ -152,7 +154,7 @@ gistfillitupvec(IndexTuple *vec, int veclen, int *memlen) * invalid tuple. Resulting Datums aren't compressed. 
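+ * Since invalid tuples no longer exist, the union cannot fail: the function
+ * now returns void and delivers the per-column union Datums solely through
+ * the attr[] and isnull[] output arrays.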
*/ -bool +void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey, Datum *attr, bool *isnull) { @@ -180,10 +182,6 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke Datum datum; bool IsNull; - if (GistTupleIsInvalid(itvec[j])) - return FALSE; /* signals that union with invalid tuple => - * result is invalid */ - datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull); if (IsNull) continue; @@ -211,15 +209,14 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke } /* Make union and store in attr array */ - attr[i] = FunctionCall2(&giststate->unionFn[i], - PointerGetDatum(evec), - PointerGetDatum(&attrsize)); + attr[i] = FunctionCall2Coll(&giststate->unionFn[i], + giststate->supportCollation[i], + PointerGetDatum(evec), + PointerGetDatum(&attrsize)); isnull[i] = FALSE; } } - - return TRUE; } /* @@ -231,8 +228,7 @@ gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate) { memset(isnullS, TRUE, sizeof(bool) * giststate->tupdesc->natts); - if (!gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS)) - return gist_form_invalid_tuple(InvalidBlockNumber); + gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS); return gistFormTuple(giststate, r, attrS, isnullS, false); } @@ -278,9 +274,10 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno, } *dstisnull = FALSE; - *dst = FunctionCall2(&giststate->unionFn[attno], - PointerGetDatum(evec), - PointerGetDatum(&dstsize)); + *dst = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&dstsize)); } } @@ -289,9 +286,10 @@ gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b) { bool result; - FunctionCall3(&giststate->equalFn[attno], - a, b, - PointerGetDatum(&result)); + FunctionCall3Coll(&giststate->equalFn[attno], + giststate->supportCollation[attno], + a, b, + PointerGetDatum(&result)); return result; } @@ -328,9 +326,6 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis IndexTuple newtup = NULL; int i; - if (GistTupleIsInvalid(oldtup) || GistTupleIsInvalid(addtup)) - return gist_form_invalid_tuple(ItemPointerGetBlockNumber(&(oldtup->t_tid))); - gistDeCompressAtt(giststate, r, oldtup, NULL, (OffsetNumber) 0, oldentries, oldisnull); @@ -401,14 +396,6 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ int j; IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); - if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup)) - { - ereport(LOG, - (errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery", - RelationGetRelationName(r)))); - continue; - } - sum_grow = 0; for (j = 0; j < r->rd_att->natts; j++) { @@ -460,8 +447,9 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, gistentryinit(*e, k, r, pg, o, l); dep = (GISTENTRY *) - DatumGetPointer(FunctionCall1(&giststate->decompressFn[nkey], - PointerGetDatum(e))); + DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(e))); /* decompressFn may just return the given pointer */ if (dep != e) gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, @@ -486,8 +474,9 @@ gistcentryinit(GISTSTATE *giststate, int nkey, gistentryinit(*e, k, r, pg, o, l); cep = (GISTENTRY *) - DatumGetPointer(FunctionCall1(&giststate->compressFn[nkey], - PointerGetDatum(e))); + DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey], + 
giststate->supportCollation[nkey],
+						  PointerGetDatum(e)));
 	/* compressFn may just return the given pointer */
 	if (cep != e)
 		gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset,
@@ -521,7 +510,12 @@ gistFormTuple(GISTSTATE *giststate, Relation r,
 	}

 	res = index_form_tuple(giststate->tupdesc, compatt, isnull);
-	GistTupleSetValid(res);
+
+	/*
+	 * The offset number on tuples on internal pages is unused. For historical
+	 * reasons, it is set to 0xffff.
+	 */
+	ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
 	return res;
 }

@@ -532,16 +526,23 @@ gistpenalty(GISTSTATE *giststate, int attno,
 {
 	float		penalty = 0.0;

-	if (giststate->penaltyFn[attno].fn_strict == FALSE || (isNullOrig == FALSE && isNullAdd == FALSE))
-		FunctionCall3(&giststate->penaltyFn[attno],
-					  PointerGetDatum(orig),
-					  PointerGetDatum(add),
-					  PointerGetDatum(&penalty));
+	if (giststate->penaltyFn[attno].fn_strict == FALSE ||
+		(isNullOrig == FALSE && isNullAdd == FALSE))
+	{
+		FunctionCall3Coll(&giststate->penaltyFn[attno],
+						  giststate->supportCollation[attno],
+						  PointerGetDatum(orig),
+						  PointerGetDatum(add),
+						  PointerGetDatum(&penalty));
+		/* disallow negative or NaN penalty */
+		if (isnan(penalty) || penalty < 0.0)
+			penalty = 0.0;
+	}
 	else if (isNullOrig && isNullAdd)
 		penalty = 0.0;
 	else
-		penalty = 1e10;			/* try to prevent to mix null and non-null
-								 * value */
+		penalty = 1e10;			/* try to prevent mixing null and non-null
+								 * values */

 	return penalty;
 }

@@ -677,3 +678,24 @@ gistoptions(PG_FUNCTION_ARGS)
 		PG_RETURN_BYTEA_P(result);
 	PG_RETURN_NULL();
 }
+
+/*
+ * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
+ * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
+ * sequence of LSNs for that purpose. Each call generates an LSN that is
+ * greater than any previous value returned by this function in the same
+ * session.
+ */
+XLogRecPtr
+GetXLogRecPtrForTemp(void)
+{
+	static XLogRecPtr counter = {0, 1};
+
+	counter.xrecoff++;
+	if (counter.xrecoff == 0)
+	{
+		counter.xlogid++;
+		counter.xrecoff++;
+	}
+	return counter;
+}
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index abd3d99956..33e6f34154 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -4,11 +4,11 @@
 * vacuuming routines for the postgres GiST index access method.
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.48 2010/02/08 05:17:31 tgl Exp $ + * src/backend/access/gist/gistvacuum.c * *------------------------------------------------------------------------- */ @@ -26,13 +26,6 @@ #include "utils/memutils.h" -typedef struct GistBulkDeleteResult -{ - IndexBulkDeleteResult std; /* common state */ - bool needReindex; -} GistBulkDeleteResult; - - /* * VACUUM cleanup: update FSM */ @@ -40,13 +33,11 @@ Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); - GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; - BlockNumber lastBlock = GIST_ROOT_BLKNO, - lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* No-op in ANALYZE ONLY mode */ @@ -56,10 +47,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { - stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ - stats->std.num_index_tuples = info->num_heap_tuples; - stats->std.estimated_count = info->estimated_count; + stats->num_index_tuples = info->num_heap_tuples; + stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just @@ -67,11 +58,6 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) */ } - if (stats->needReindex) - ereport(NOTICE, - (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", - RelationGetRelationName(rel)))); - /* * Need lock unless it's local to this backend. 
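+	 * The relation extension lock keeps RelationGetNumberOfBlocks() from
+	 * racing against a concurrent extension of the index.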
*/ @@ -102,20 +88,17 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) totFreePages++; RecordFreeIndexPage(rel, blkno); } - else - lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } - lastBlock = npages - 1; /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ - stats->std.pages_free = totFreePages; + stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); - stats->std.num_pages = RelationGetNumberOfBlocks(rel); + stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); @@ -135,7 +118,7 @@ pushStackIfSplited(Page page, GistBDItem *stack) GISTPageOpaque opaque = GistPageGetOpaque(page); if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) && - XLByteLT(stack->parentlsn, opaque->nsn) && + (GistFollowRight(page) || XLByteLT(stack->parentlsn, opaque->nsn)) && opaque->rightlink != InvalidBlockNumber /* sanity check */ ) { /* split page detected, install right link to the stack */ @@ -162,7 +145,7 @@ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); - GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; @@ -171,10 +154,10 @@ gistbulkdelete(PG_FUNCTION_ARGS) /* first time through? */ if (stats == NULL) - stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ - stats->std.estimated_count = false; - stats->std.num_index_tuples = 0; + stats->estimated_count = false; + stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; @@ -232,10 +215,10 @@ gistbulkdelete(PG_FUNCTION_ARGS) { todelete[ntodelete] = i - ntodelete; ntodelete++; - stats->std.tuples_removed += 1; + stats->tuples_removed += 1; } else - stats->std.num_index_tuples += 1; + stats->num_index_tuples += 1; } if (ntodelete) @@ -248,27 +231,18 @@ gistbulkdelete(PG_FUNCTION_ARGS) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { - XLogRecData *rdata; XLogRecPtr recptr; - gistxlogPageUpdate *xlinfo; - rdata = formUpdateRdata(rel->rd_node, buffer, + recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, - NULL, 0, - NULL); - xlinfo = (gistxlogPageUpdate *) rdata->next->data; - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); + NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); - - pfree(xlinfo); - pfree(rdata); } else - PageSetLSN(page, XLogRecPtrForTemp); + PageSetLSN(page, GetXLogRecPtrForTemp()); END_CRIT_SECTION(); } @@ -293,7 +267,11 @@ gistbulkdelete(PG_FUNCTION_ARGS) stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) - stats->needReindex = true; + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."), + errhint("Please REINDEX it."))); } } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 7f5dd990c8..02c4ec3a6f 100644 --- 
a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -4,11 +4,11 @@ * WAL replay logic for GiST. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.35 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/gist/gistxlog.c *------------------------------------------------------------------------- */ #include "postgres.h" @@ -20,15 +20,6 @@ #include "utils/memutils.h" #include "utils/rel.h" - -typedef struct -{ - gistxlogPageUpdate *data; - int len; - IndexTuple *itup; - OffsetNumber *todelete; -} PageUpdateRecord; - typedef struct { gistxlogPage *header; @@ -41,144 +32,37 @@ typedef struct NewPage *page; } PageSplitRecord; -/* track for incomplete inserts, idea was taken from nbtxlog.c */ - -typedef struct gistIncompleteInsert -{ - RelFileNode node; - BlockNumber origblkno; /* for splits */ - ItemPointerData key; - int lenblk; - BlockNumber *blkno; - XLogRecPtr lsn; - BlockNumber *path; - int pathlen; -} gistIncompleteInsert; - - static MemoryContext opCtx; /* working memory for operations */ -static MemoryContext insertCtx; /* holds incomplete_inserts list */ -static List *incomplete_inserts; - - -#define ItemPointerEQ(a, b) \ - ( ItemPointerGetOffsetNumber(a) == ItemPointerGetOffsetNumber(b) && \ - ItemPointerGetBlockNumber (a) == ItemPointerGetBlockNumber(b) ) - +/* + * Replay the clearing of F_FOLLOW_RIGHT flag. + */ static void -pushIncompleteInsert(RelFileNode node, XLogRecPtr lsn, ItemPointerData key, - BlockNumber *blkno, int lenblk, - PageSplitRecord *xlinfo /* to extract blkno info */ ) +gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn, + BlockNumber leftblkno) { - MemoryContext oldCxt; - gistIncompleteInsert *ninsert; + Buffer buffer; - if (!ItemPointerIsValid(&key)) + buffer = XLogReadBuffer(node, leftblkno, false); + if (BufferIsValid(buffer)) + { + Page page = (Page) BufferGetPage(buffer); /* - * if key is null then we should not store insertion as incomplete, - * because it's a vacuum operation.. + * Note that we still update the page even if page LSN is equal to the + * LSN of this record, because the updated NSN is not included in the + * full page image. */ - return; - - oldCxt = MemoryContextSwitchTo(insertCtx); - ninsert = (gistIncompleteInsert *) palloc(sizeof(gistIncompleteInsert)); - - ninsert->node = node; - ninsert->key = key; - ninsert->lsn = lsn; - - if (lenblk && blkno) - { - ninsert->lenblk = lenblk; - ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk); - memcpy(ninsert->blkno, blkno, sizeof(BlockNumber) * ninsert->lenblk); - ninsert->origblkno = *blkno; - } - else - { - int i; - - Assert(xlinfo); - ninsert->lenblk = xlinfo->data->npage; - ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk); - for (i = 0; i < ninsert->lenblk; i++) - ninsert->blkno[i] = xlinfo->page[i].header->blkno; - ninsert->origblkno = xlinfo->data->origblkno; - } - Assert(ninsert->lenblk > 0); - - /* - * Stick the new incomplete insert onto the front of the list, not the - * back. This is so that gist_xlog_cleanup will process incompletions in - * last-in-first-out order. 
- */ - incomplete_inserts = lcons(ninsert, incomplete_inserts); - - MemoryContextSwitchTo(oldCxt); -} - -static void -forgetIncompleteInsert(RelFileNode node, ItemPointerData key) -{ - ListCell *l; - - if (!ItemPointerIsValid(&key)) - return; - - if (incomplete_inserts == NIL) - return; - - foreach(l, incomplete_inserts) - { - gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l); - - if (RelFileNodeEquals(node, insert->node) && ItemPointerEQ(&(insert->key), &(key))) + if (!XLByteLT(lsn, PageGetLSN(page))) { - /* found */ - incomplete_inserts = list_delete_ptr(incomplete_inserts, insert); - pfree(insert->blkno); - pfree(insert); - break; - } - } -} + GistPageGetOpaque(page)->nsn = lsn; + GistClearFollowRight(page); -static void -decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record) -{ - char *begin = XLogRecGetData(record), - *ptr; - int i = 0, - addpath = 0; - - decoded->data = (gistxlogPageUpdate *) begin; - - if (decoded->data->ntodelete) - { - decoded->todelete = (OffsetNumber *) (begin + sizeof(gistxlogPageUpdate) + addpath); - addpath = MAXALIGN(sizeof(OffsetNumber) * decoded->data->ntodelete); - } - else - decoded->todelete = NULL; - - decoded->len = 0; - ptr = begin + sizeof(gistxlogPageUpdate) + addpath; - while (ptr - begin < record->xl_len) - { - decoded->len++; - ptr += IndexTupleSize((IndexTuple) ptr); - } - - decoded->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * decoded->len); - - ptr = begin + sizeof(gistxlogPageUpdate) + addpath; - while (ptr - begin < record->xl_len) - { - decoded->itup[i] = (IndexTuple) ptr; - ptr += IndexTupleSize(decoded->itup[i]); - i++; + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } } @@ -186,29 +70,22 @@ decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record) * redo any page update (except page split) */ static void -gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot) +gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { - gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); - PageUpdateRecord xlrec; + char *begin = XLogRecGetData(record); + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; + char *data; - /* we must fix incomplete_inserts list even if XLR_BKP_BLOCK_1 is set */ - forgetIncompleteInsert(xldata->node, xldata->key); + if (BlockNumberIsValid(xldata->leftchild)) + gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); - if (!isnewroot && xldata->blkno != GIST_ROOT_BLKNO) - /* operation with root always finalizes insertion */ - pushIncompleteInsert(xldata->node, lsn, xldata->key, - &(xldata->blkno), 1, - NULL); - - /* nothing else to do if page was backed up (and no info to do it with) */ + /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK_1) return; - decodePageUpdateRecord(&xlrec, record); - - buffer = XLogReadBuffer(xlrec.data->node, xlrec.data->blkno, false); + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); @@ -219,28 +96,52 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot) return; } - if (isnewroot) - GISTInitBuffer(buffer, 0); - else if (xlrec.data->ntodelete) + data = begin + sizeof(gistxlogPageUpdate); + + /* Delete old tuples */ + if (xldata->ntodelete > 0) { int i; + OffsetNumber *todelete = (OffsetNumber *) data; + + data += 
sizeof(OffsetNumber) * xldata->ntodelete; - for (i = 0; i < xlrec.data->ntodelete; i++) - PageIndexTupleDelete(page, xlrec.todelete[i]); + for (i = 0; i < xldata->ntodelete; i++) + PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ - if (xlrec.len > 0) - gistfillbuffer(page, xlrec.itup, xlrec.len, InvalidOffsetNumber); + if (data - begin < record->xl_len) + { + OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); - /* - * special case: leafpage, nothing to insert, nothing to delete, then - * vacuum marks page - */ - if (GistPageIsLeaf(page) && xlrec.len == 0 && xlrec.data->ntodelete == 0) - GistClearTuplesDeleted(page); + while (data - begin < record->xl_len) + { + IndexTuple itup = (IndexTuple) data; + Size sz = IndexTupleSize(itup); + OffsetNumber l; + + data += sz; + + l = PageAddItem(page, (Item) itup, sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) sz); + off++; + } + } + else + { + /* + * special case: leafpage, nothing to insert, nothing to delete, then + * vacuum marks page + */ + if (GistPageIsLeaf(page) && xldata->ntodelete == 0) + GistClearTuplesDeleted(page); + } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) @@ -315,41 +216,67 @@ decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record) static void gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { + gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; Buffer buffer; Page page; int i; - int flags; + bool isrootsplit = false; + if (BlockNumberIsValid(xldata->leftchild)) + gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); decodePageSplitRecord(&xlrec, record); - flags = xlrec.data->origleaf ? 
F_LEAF : 0; /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { NewPage *newpage = xlrec.page + i; + int flags; + + if (newpage->header->blkno == GIST_ROOT_BLKNO) + { + Assert(i == 0); + isrootsplit = true; + } buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); /* ok, clear buffer */ + if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) + flags = F_LEAF; + else + flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); + if (newpage->header->blkno == GIST_ROOT_BLKNO) + { + GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; + GistPageGetOpaque(page)->nsn = xldata->orignsn; + GistClearFollowRight(page); + } + else + { + if (i < xlrec.data->npage - 1) + GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; + else + GistPageGetOpaque(page)->rightlink = xldata->origrlink; + GistPageGetOpaque(page)->nsn = xldata->orignsn; + if (i < xlrec.data->npage - 1 && !isrootsplit) + GistMarkFollowRight(page); + else + GistClearFollowRight(page); + } + PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } - - forgetIncompleteInsert(xlrec.data->node, xlrec.data->key); - - pushIncompleteInsert(xlrec.data->node, lsn, xlrec.data->key, - NULL, 0, - &xlrec); } static void @@ -372,24 +299,6 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } -static void -gistRedoCompleteInsert(XLogRecPtr lsn, XLogRecord *record) -{ - char *begin = XLogRecGetData(record), - *ptr; - gistxlogInsertComplete *xlrec; - - xlrec = (gistxlogInsertComplete *) begin; - - ptr = begin + sizeof(gistxlogInsertComplete); - while (ptr - begin < record->xl_len) - { - Assert(record->xl_len - (ptr - begin) >= sizeof(ItemPointerData)); - forgetIncompleteInsert(xlrec->node, *((ItemPointerData *) ptr)); - ptr += sizeof(ItemPointerData); - } -} - void gist_redo(XLogRecPtr lsn, XLogRecord *record) { @@ -397,34 +306,27 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) MemoryContext oldCxt; /* - * GIST indexes do not require any conflict processing. NB: If we ever + * GiST indexes do not require any conflict processing. NB: If we ever * implement a similar optimization we have in b-tree, and remove killed * tuples outside VACUUM, we'll need to handle that here. 
*/ - RestoreBkpBlocks(lsn, record, false); oldCxt = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_GIST_PAGE_UPDATE: - gistRedoPageUpdateRecord(lsn, record, false); + gistRedoPageUpdateRecord(lsn, record); break; case XLOG_GIST_PAGE_DELETE: gistRedoPageDeleteRecord(lsn, record); break; - case XLOG_GIST_NEW_ROOT: - gistRedoPageUpdateRecord(lsn, record, true); - break; case XLOG_GIST_PAGE_SPLIT: gistRedoPageSplitRecord(lsn, record); break; case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(lsn, record); break; - case XLOG_GIST_INSERT_COMPLETE: - gistRedoCompleteInsert(lsn, record); - break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -434,20 +336,16 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) } static void -out_target(StringInfo buf, RelFileNode node, ItemPointerData key) +out_target(StringInfo buf, RelFileNode node) { appendStringInfo(buf, "rel %u/%u/%u", node.spcNode, node.dbNode, node.relNode); - if (ItemPointerIsValid(&key)) - appendStringInfo(buf, "; tid %u/%u", - ItemPointerGetBlockNumber(&key), - ItemPointerGetOffsetNumber(&key)); } static void out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) { - out_target(buf, xlrec->node, xlrec->key); + out_target(buf, xlrec->node); appendStringInfo(buf, "; block number %u", xlrec->blkno); } @@ -463,7 +361,7 @@ static void out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) { appendStringInfo(buf, "page_split: "); - out_target(buf, xlrec->node, xlrec->key); + out_target(buf, xlrec->node); appendStringInfo(buf, "; block number %u splits to %d pages", xlrec->origblkno, xlrec->npage); } @@ -482,10 +380,6 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec) case XLOG_GIST_PAGE_DELETE: out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec); break; - case XLOG_GIST_NEW_ROOT: - appendStringInfo(buf, "new_root: "); - out_target(buf, ((gistxlogPageUpdate *) rec)->node, ((gistxlogPageUpdate *) rec)->key); - break; case XLOG_GIST_PAGE_SPLIT: out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); break; @@ -495,415 +389,102 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec) ((RelFileNode *) rec)->dbNode, ((RelFileNode *) rec)->relNode); break; - case XLOG_GIST_INSERT_COMPLETE: - appendStringInfo(buf, "complete_insert: rel %u/%u/%u", - ((gistxlogInsertComplete *) rec)->node.spcNode, - ((gistxlogInsertComplete *) rec)->node.dbNode, - ((gistxlogInsertComplete *) rec)->node.relNode); - break; default: appendStringInfo(buf, "unknown gist op code %u", info); break; } } -IndexTuple -gist_form_invalid_tuple(BlockNumber blkno) -{ - /* - * we don't alloc space for null's bitmap, this is invalid tuple, be - * carefull in read and write code - */ - Size size = IndexInfoFindDataOffset(0); - IndexTuple tuple = (IndexTuple) palloc0(size); - - tuple->t_info |= size; - - ItemPointerSetBlockNumber(&(tuple->t_tid), blkno); - GistTupleSetInvalid(tuple); - - return tuple; -} - - -static void -gistxlogFindPath(Relation index, gistIncompleteInsert *insert) -{ - GISTInsertStack *top; - - insert->pathlen = 0; - insert->path = NULL; - - if ((top = gistFindPath(index, insert->origblkno)) != NULL) - { - int i; - GISTInsertStack *ptr; - - for (ptr = top; ptr; ptr = ptr->parent) - insert->pathlen++; - - insert->path = (BlockNumber *) palloc(sizeof(BlockNumber) * insert->pathlen); - - i = 0; - for (ptr = top; ptr; ptr = ptr->parent) - insert->path[i++] = ptr->blkno; - } - else - elog(ERROR, "lost parent for block %u", insert->origblkno); -} - -static SplitedPageLayout * -gistMakePageLayout(Buffer *buffers, int nbuffers) 
-{ - SplitedPageLayout *res = NULL, - *resptr; - - while (nbuffers-- > 0) - { - Page page = BufferGetPage(buffers[nbuffers]); - IndexTuple *vec; - int veclen; - - resptr = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout)); - - resptr->block.blkno = BufferGetBlockNumber(buffers[nbuffers]); - resptr->block.num = PageGetMaxOffsetNumber(page); - - vec = gistextractpage(page, &veclen); - resptr->list = gistfillitupvec(vec, veclen, &(resptr->lenlist)); - - resptr->next = res; - res = resptr; - } - - return res; -} - -/* - * Continue insert after crash. In normal situations, there aren't any - * incomplete inserts, but if a crash occurs partway through an insertion - * sequence, we'll need to finish making the index valid at the end of WAL - * replay. - * - * Note that we assume the index is now in a valid state, except for the - * unfinished insertion. In particular it's safe to invoke gistFindPath(); - * there shouldn't be any garbage pages for it to run into. - * - * To complete insert we can't use basic insertion algorithm because - * during insertion we can't call user-defined support functions of opclass. - * So, we insert 'invalid' tuples without real key and do it by separate algorithm. - * 'invalid' tuple should be updated by vacuum full. - */ -static void -gistContinueInsert(gistIncompleteInsert *insert) -{ - IndexTuple *itup; - int i, - lenitup; - Relation index; - - index = CreateFakeRelcacheEntry(insert->node); - - /* - * needed vector itup never will be more than initial lenblkno+2, because - * during this processing Indextuple can be only smaller - */ - lenitup = insert->lenblk; - itup = (IndexTuple *) palloc(sizeof(IndexTuple) * (lenitup + 2 /* guarantee root split */ )); - - for (i = 0; i < insert->lenblk; i++) - itup[i] = gist_form_invalid_tuple(insert->blkno[i]); - - /* - * any insertion of itup[] should make LOG message about - */ - - if (insert->origblkno == GIST_ROOT_BLKNO) - { - /* - * it was split root, so we should only make new root. it can't be - * simple insert into root, we should replace all content of root. 
- */ - Buffer buffer = XLogReadBuffer(insert->node, GIST_ROOT_BLKNO, true); - - gistnewroot(index, buffer, itup, lenitup, NULL); - UnlockReleaseBuffer(buffer); - } - else - { - Buffer *buffers; - Page *pages; - int numbuffer; - OffsetNumber *todelete; - - /* construct path */ - gistxlogFindPath(index, insert); - - Assert(insert->pathlen > 0); - - buffers = (Buffer *) palloc(sizeof(Buffer) * (insert->lenblk + 2 /* guarantee root split */ )); - pages = (Page *) palloc(sizeof(Page) * (insert->lenblk + 2 /* guarantee root split */ )); - todelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (insert->lenblk + 2 /* guarantee root split */ )); - - for (i = 0; i < insert->pathlen; i++) - { - int j, - k, - pituplen = 0; - uint8 xlinfo; - XLogRecData *rdata; - XLogRecPtr recptr; - Buffer tempbuffer = InvalidBuffer; - int ntodelete = 0; - - numbuffer = 1; - buffers[0] = ReadBuffer(index, insert->path[i]); - LockBuffer(buffers[0], GIST_EXCLUSIVE); - - /* - * we check buffer, because we restored page earlier - */ - gistcheckpage(index, buffers[0]); - - pages[0] = BufferGetPage(buffers[0]); - Assert(!GistPageIsLeaf(pages[0])); - - pituplen = PageGetMaxOffsetNumber(pages[0]); - - /* find remove old IndexTuples to remove */ - for (j = 0; j < pituplen && ntodelete < lenitup; j++) - { - BlockNumber blkno; - ItemId iid = PageGetItemId(pages[0], j + FirstOffsetNumber); - IndexTuple idxtup = (IndexTuple) PageGetItem(pages[0], iid); - - blkno = ItemPointerGetBlockNumber(&(idxtup->t_tid)); - - for (k = 0; k < lenitup; k++) - if (ItemPointerGetBlockNumber(&(itup[k]->t_tid)) == blkno) - { - todelete[ntodelete] = j + FirstOffsetNumber - ntodelete; - ntodelete++; - break; - } - } - - if (ntodelete == 0) - elog(PANIC, "gistContinueInsert: cannot find pointer to page(s)"); - - /* - * we check space with subtraction only first tuple to delete, - * hope, that wiil be enough space.... 
- */ - - if (gistnospace(pages[0], itup, lenitup, *todelete, 0)) - { - - /* no space left on page, so we must split */ - buffers[numbuffer] = ReadBuffer(index, P_NEW); - LockBuffer(buffers[numbuffer], GIST_EXCLUSIVE); - GISTInitBuffer(buffers[numbuffer], 0); - pages[numbuffer] = BufferGetPage(buffers[numbuffer]); - gistfillbuffer(pages[numbuffer], itup, lenitup, FirstOffsetNumber); - numbuffer++; - - if (BufferGetBlockNumber(buffers[0]) == GIST_ROOT_BLKNO) - { - Buffer tmp; - - /* - * we split root, just copy content from root to new page - */ - - /* sanity check */ - if (i + 1 != insert->pathlen) - elog(PANIC, "unexpected pathlen in index \"%s\"", - RelationGetRelationName(index)); - - /* fill new page, root will be changed later */ - tempbuffer = ReadBuffer(index, P_NEW); - LockBuffer(tempbuffer, GIST_EXCLUSIVE); - memcpy(BufferGetPage(tempbuffer), pages[0], BufferGetPageSize(tempbuffer)); - - /* swap buffers[0] (was root) and temp buffer */ - tmp = buffers[0]; - buffers[0] = tempbuffer; - tempbuffer = tmp; /* now in tempbuffer GIST_ROOT_BLKNO, - * it is still unchanged */ - - pages[0] = BufferGetPage(buffers[0]); - } - - START_CRIT_SECTION(); - - for (j = 0; j < ntodelete; j++) - PageIndexTupleDelete(pages[0], todelete[j]); - - xlinfo = XLOG_GIST_PAGE_SPLIT; - rdata = formSplitRdata(index->rd_node, insert->path[i], - false, &(insert->key), - gistMakePageLayout(buffers, numbuffer)); - - } - else - { - START_CRIT_SECTION(); - - for (j = 0; j < ntodelete; j++) - PageIndexTupleDelete(pages[0], todelete[j]); - gistfillbuffer(pages[0], itup, lenitup, InvalidOffsetNumber); - - xlinfo = XLOG_GIST_PAGE_UPDATE; - rdata = formUpdateRdata(index->rd_node, buffers[0], - todelete, ntodelete, - itup, lenitup, &(insert->key)); - } - - /* - * use insert->key as mark for completion of insert (form*Rdata() - * above) for following possible replays - */ - - /* write pages, we should mark it dirty befor XLogInsert() */ - for (j = 0; j < numbuffer; j++) - { - GistPageGetOpaque(pages[j])->rightlink = InvalidBlockNumber; - MarkBufferDirty(buffers[j]); - } - recptr = XLogInsert(RM_GIST_ID, xlinfo, rdata); - for (j = 0; j < numbuffer; j++) - { - PageSetLSN(pages[j], recptr); - PageSetTLI(pages[j], ThisTimeLineID); - } - - END_CRIT_SECTION(); - - lenitup = numbuffer; - for (j = 0; j < numbuffer; j++) - { - itup[j] = gist_form_invalid_tuple(BufferGetBlockNumber(buffers[j])); - UnlockReleaseBuffer(buffers[j]); - } - - if (tempbuffer != InvalidBuffer) - { - /* - * it was a root split, so fill it by new values - */ - gistnewroot(index, tempbuffer, itup, lenitup, &(insert->key)); - UnlockReleaseBuffer(tempbuffer); - } - } - } - - FreeFakeRelcacheEntry(index); - - ereport(LOG, - (errmsg("index %u/%u/%u needs VACUUM FULL or REINDEX to finish crash recovery", - insert->node.spcNode, insert->node.dbNode, insert->node.relNode), - errdetail("Incomplete insertion detected during crash replay."))); -} - void gist_xlog_startup(void) { - incomplete_inserts = NIL; - insertCtx = AllocSetContextCreate(CurrentMemoryContext, - "GiST recovery temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); opCtx = createTempGistContext(); } void gist_xlog_cleanup(void) { - ListCell *l; - MemoryContext oldCxt; - - oldCxt = MemoryContextSwitchTo(opCtx); - - foreach(l, incomplete_inserts) - { - gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l); - - gistContinueInsert(insert); - MemoryContextReset(opCtx); - } - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(opCtx); - 
MemoryContextDelete(insertCtx); -} - -bool -gist_safe_restartpoint(void) -{ - if (incomplete_inserts) - return false; - return true; } - -XLogRecData * -formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf, - ItemPointer key, SplitedPageLayout *dist) +/* + * Write WAL record of a page split. + */ +XLogRecPtr +gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, + SplitedPageLayout *dist, + BlockNumber origrlink, GistNSN orignsn, + Buffer leftchildbuf) { XLogRecData *rdata; - gistxlogPageSplit *xlrec = (gistxlogPageSplit *) palloc(sizeof(gistxlogPageSplit)); + gistxlogPageSplit xlrec; SplitedPageLayout *ptr; int npage = 0, - cur = 1; + cur; + XLogRecPtr recptr; - ptr = dist; - while (ptr) - { + for (ptr = dist; ptr; ptr = ptr->next) npage++; - ptr = ptr->next; - } rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2)); - xlrec->node = node; - xlrec->origblkno = blkno; - xlrec->origleaf = page_is_leaf; - xlrec->npage = (uint16) npage; - if (key) - xlrec->key = *key; - else - ItemPointerSetInvalid(&(xlrec->key)); + xlrec.node = node; + xlrec.origblkno = blkno; + xlrec.origrlink = origrlink; + xlrec.orignsn = orignsn; + xlrec.origleaf = page_is_leaf; + xlrec.npage = (uint16) npage; + xlrec.leftchild = + BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) xlrec; + rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(gistxlogPageSplit); - rdata[0].next = NULL; + rdata[0].buffer = InvalidBuffer; + + cur = 1; - ptr = dist; - while (ptr) + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) { + rdata[cur - 1].next = &(rdata[cur]); + rdata[cur].data = NULL; + rdata[cur].len = 0; + rdata[cur].buffer = leftchildbuf; + rdata[cur].buffer_std = true; + cur++; + } + + for (ptr = dist; ptr; ptr = ptr->next) + { + rdata[cur - 1].next = &(rdata[cur]); rdata[cur].buffer = InvalidBuffer; rdata[cur].data = (char *) &(ptr->block); rdata[cur].len = sizeof(gistxlogPage); - rdata[cur - 1].next = &(rdata[cur]); cur++; + rdata[cur - 1].next = &(rdata[cur]); rdata[cur].buffer = InvalidBuffer; rdata[cur].data = (char *) (ptr->list); rdata[cur].len = ptr->lenlist; - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].next = NULL; cur++; - ptr = ptr->next; } + rdata[cur - 1].next = NULL; + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); - return rdata; + pfree(rdata); + return recptr; } /* - * Construct the rdata array for an XLOG record describing a page update - * (deletion and/or insertion of tuples on a single index page). + * Write XLOG record describing a page update. The update can include any + * number of deletions and/or insertions of tuples on a single index page. + * + * If this update inserts a downlink for a split page, also record that + * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set. * * Note that both the todelete array and the tuples are marked as belonging * to the target buffer; they need not be stored in XLOG if XLogInsert decides @@ -911,27 +492,26 @@ formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf, * at least one rdata item referencing the buffer, even when ntodelete and * ituplen are both zero; this ensures that XLogInsert knows about the buffer. 
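
gistXLogSplit() above attaches the left child buffer as a data-less XLogRecData entry, and the comment above spells out why: XLogInsert() must see at least one rdata item referencing the buffer so that it can substitute a full-page image for the payload when one is needed. The idiom in isolation (illustrative helper, not part of the patch; the field names are from XLogRecData as used in these hunks):

    /*
     * Sketch: reference a buffer from a WAL record without logging
     * any payload of our own.  XLogInsert() may then emit a full-page
     * image of the buffer if a checkpoint intervened.
     */
    static void
    attach_buffer_only(XLogRecData *rdata, Buffer buf)
    {
        rdata->data = NULL;
        rdata->len = 0;
        rdata->buffer = buf;        /* this is what XLogInsert keys on */
        rdata->buffer_std = true;   /* page uses the standard layout */
        rdata->next = NULL;
    }
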
*/ -XLogRecData * -formUpdateRdata(RelFileNode node, Buffer buffer, - OffsetNumber *todelete, int ntodelete, - IndexTuple *itup, int ituplen, ItemPointer key) +XLogRecPtr +gistXLogUpdate(RelFileNode node, Buffer buffer, + OffsetNumber *todelete, int ntodelete, + IndexTuple *itup, int ituplen, + Buffer leftchildbuf) { XLogRecData *rdata; gistxlogPageUpdate *xlrec; int cur, i; + XLogRecPtr recptr; - rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen)); + rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen)); xlrec = (gistxlogPageUpdate *) palloc(sizeof(gistxlogPageUpdate)); xlrec->node = node; xlrec->blkno = BufferGetBlockNumber(buffer); xlrec->ntodelete = ntodelete; - - if (key) - xlrec->key = *key; - else - ItemPointerSetInvalid(&(xlrec->key)); + xlrec->leftchild = + BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; rdata[0].buffer = buffer; rdata[0].buffer_std = true; @@ -945,13 +525,13 @@ formUpdateRdata(RelFileNode node, Buffer buffer, rdata[1].next = &(rdata[2]); rdata[2].data = (char *) todelete; - rdata[2].len = MAXALIGN(sizeof(OffsetNumber) * ntodelete); + rdata[2].len = sizeof(OffsetNumber) * ntodelete; rdata[2].buffer = buffer; rdata[2].buffer_std = true; - rdata[2].next = NULL; - /* new tuples */ cur = 3; + + /* new tuples */ for (i = 0; i < ituplen; i++) { rdata[cur - 1].next = &(rdata[cur]); @@ -959,38 +539,26 @@ formUpdateRdata(RelFileNode node, Buffer buffer, rdata[cur].len = IndexTupleSize(itup[i]); rdata[cur].buffer = buffer; rdata[cur].buffer_std = true; - rdata[cur].next = NULL; cur++; } - return rdata; -} - -XLogRecPtr -gistxlogInsertCompletion(RelFileNode node, ItemPointerData *keys, int len) -{ - gistxlogInsertComplete xlrec; - XLogRecData rdata[2]; - XLogRecPtr recptr; - - Assert(len > 0); - xlrec.node = node; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(gistxlogInsertComplete); - rdata[0].next = &(rdata[1]); - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) keys; - rdata[1].len = sizeof(ItemPointerData) * len; - rdata[1].next = NULL; - - START_CRIT_SECTION(); - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_INSERT_COMPLETE, rdata); + /* + * Include a full page image of the child buf. 
(only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + { + rdata[cur - 1].next = &(rdata[cur]); + rdata[cur].data = NULL; + rdata[cur].len = 0; + rdata[cur].buffer = leftchildbuf; + rdata[cur].buffer_std = true; + cur++; + } + rdata[cur - 1].next = NULL; - END_CRIT_SECTION(); + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); + pfree(rdata); return recptr; } diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile index 80f9ea61e9..82297606dc 100644 --- a/src/backend/access/hash/Makefile +++ b/src/backend/access/hash/Makefile @@ -4,7 +4,7 @@ # Makefile for access/hash # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $ +# src/backend/access/hash/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 026ad40bfb..cd4e058389 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.9 2009/11/01 21:25:25 tgl Exp $ +src/backend/access/hash/README Hash Indexing ============= diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 6474f45940..4cb29b2bb4 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -3,12 +3,12 @@ * hash.c * Implementation of Margo Seltzer's Hashing package for postgres. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.117 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/hash/hash.c * * NOTES * This file contains only the public interface routines. 
@@ -69,7 +69,7 @@ hashbuild(PG_FUNCTION_ARGS) estimate_rel_size(heap, NULL, &relpages, &reltuples); /* Initialize the hash index metadata page and initial buckets */ - num_buckets = _hash_metapinit(index, reltuples); + num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then @@ -114,6 +114,19 @@ hashbuild(PG_FUNCTION_ARGS) } /* + * hashbuildempty() -- build an empty hash index in the initialization fork + */ +Datum +hashbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + + _hash_metapinit(index, 0, INIT_FORKNUM); + + PG_RETURN_VOID(); +} + +/* * Per-tuple callback from IndexBuildHeapScan */ static void @@ -366,12 +379,16 @@ Datum hashbeginscan(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - int keysz = PG_GETARG_INT32(1); - ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(1); + int norderbys = PG_GETARG_INT32(2); IndexScanDesc scan; HashScanOpaque so; - scan = RelationGetIndexScan(rel, keysz, scankey); + /* no order by operators allowed */ + Assert(norderbys == 0); + + scan = RelationGetIndexScan(rel, nkeys, norderbys); + so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); so->hashso_bucket_valid = false; so->hashso_bucket_blkno = 0; @@ -396,26 +413,24 @@ hashrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + + /* remaining arguments are ignored */ HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - /* if we are called from beginscan, so is still NULL */ - if (so) - { - /* release any pin we still hold */ - if (BufferIsValid(so->hashso_curbuf)) - _hash_dropbuf(rel, so->hashso_curbuf); - so->hashso_curbuf = InvalidBuffer; - - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; - - /* set position invalid (this will cause _hash_first call) */ - ItemPointerSetInvalid(&(so->hashso_curpos)); - ItemPointerSetInvalid(&(so->hashso_heappos)); - } + /* release any pin we still hold */ + if (BufferIsValid(so->hashso_curbuf)) + _hash_dropbuf(rel, so->hashso_curbuf); + so->hashso_curbuf = InvalidBuffer; + + /* release lock on bucket, too */ + if (so->hashso_bucket_blkno) + _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); + so->hashso_bucket_blkno = 0; + + /* set position invalid (this will cause _hash_first call) */ + ItemPointerSetInvalid(&(so->hashso_curpos)); + ItemPointerSetInvalid(&(so->hashso_heappos)); /* Update scan key, if a new one is given */ if (scankey && scan->numberOfKeys > 0) @@ -423,8 +438,7 @@ hashrescan(PG_FUNCTION_ARGS) memmove(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); - if (so) - so->hashso_bucket_valid = false; + so->hashso_bucket_valid = false; } PG_RETURN_VOID(); diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 6b99acff88..e00696eb66 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -3,12 +3,12 @@ * hashfunc.c * Support functions for hash access method. 
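
The new hashbuildempty() above writes the metapage into the relation's initialization fork; the ForkNumber parameter threaded through _hash_metapinit() and friends exists for exactly that call. The same shape for a hypothetical access method (the myam_* names are illustrative; only INIT_FORKNUM and the fmgr macros are real):

    /*
     * Sketch: an ambuildempty-style entry point.  It stores the empty
     * index state in the init fork, which is used to reset an unlogged
     * relation after a crash when the main fork is not WAL-logged.
     */
    Datum
    myam_buildempty(PG_FUNCTION_ARGS)
    {
        Relation    index = (Relation) PG_GETARG_POINTER(0);

        /* same code path as a normal build, just a different fork */
        myam_metapinit(index, 0, INIT_FORKNUM);     /* hypothetical */

        PG_RETURN_VOID();
    }
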
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.62 2010/01/07 04:53:34 tgl Exp $ + * src/backend/access/hash/hashfunc.c * * NOTES * These functions are stored in pg_amproc. For each operator class diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 4d1cbbc8da..dd58f16840 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -3,12 +3,12 @@ * hashinsert.c * Item insertion in hash tables for Postgres. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.54 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/hash/hashinsert.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 3ca8d733ad..ae8b2b1cfd 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -3,12 +3,12 @@ * hashovfl.c * Overflow page management code for the Postgres hash access method * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.69 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/hash/hashovfl.c * * NOTES * Overflow pages look like ordinary relation pages. @@ -259,7 +259,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * convenient to pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; - _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit)); + _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); metap->hashm_spares[splitnum]++; } else @@ -280,7 +280,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. */ - newbuf = _hash_getnewbuf(rel, blkno); + newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); metap->hashm_spares[splitnum]++; @@ -503,7 +503,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, * All bits in the new bitmap page are set to "1", indicating "in use". */ void -_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) +_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, + ForkNumber forkNum) { Buffer buf; Page pg; @@ -520,7 +521,7 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) * page while holding the metapage lock, but this path is taken so seldom * that it's not worth worrying about. 
*/ - buf = _hash_getnewbuf(rel, blkno); + buf = _hash_getnewbuf(rel, blkno, forkNum); pg = BufferGetPage(buf); /* initialize the page's special space */ diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 77b072c880..fe991cf27b 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -3,12 +3,12 @@ * hashpage.c * Hash table page management code for the Postgres hash access method * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.83 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/hash/hashpage.c * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -183,9 +183,9 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno) * extend the index at a time. */ Buffer -_hash_getnewbuf(Relation rel, BlockNumber blkno) +_hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) { - BlockNumber nblocks = RelationGetNumberOfBlocks(rel); + BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum); Buffer buf; if (blkno == P_NEW) @@ -197,13 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno) /* smgr insists we use P_NEW to extend the relation */ if (blkno == nblocks) { - buf = ReadBuffer(rel, P_NEW); + buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); if (BufferGetBlockNumber(buf) != blkno) elog(ERROR, "unexpected hash relation size: %u, should be %u", BufferGetBlockNumber(buf), blkno); } else - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL); + buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL); LockBuffer(buf, HASH_WRITE); @@ -324,7 +324,7 @@ _hash_chgbufaccess(Relation rel, * multiple buffer locks is ignored. */ uint32 -_hash_metapinit(Relation rel, double num_tuples) +_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; @@ -340,7 +340,7 @@ _hash_metapinit(Relation rel, double num_tuples) uint32 i; /* safety check */ - if (RelationGetNumberOfBlocks(rel) != 0) + if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); @@ -383,7 +383,7 @@ _hash_metapinit(Relation rel, double num_tuples) * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. 
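
_hash_getnewbuf() above now takes the target fork explicitly, so metapage and bucket initialization work identically in the main fork and the init fork. Its extend-or-overwrite decision, condensed (same calls as the hunk above; the sanity checks are omitted):

    /*
     * Sketch: get block 'blkno' of the given fork for (re)initialization.
     * smgr insists that relations be extended with P_NEW, so extend when
     * blkno is exactly one past the current end; otherwise read with
     * RBM_ZERO, which skips the useless read of a page we will overwrite.
     */
    static Buffer
    get_block_for_init(Relation rel, BlockNumber blkno, ForkNumber forkNum)
    {
        BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);

        if (blkno == nblocks)
            return ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);

        return ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
    }
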
*/ - metabuf = _hash_getnewbuf(rel, HASH_METAPAGE); + metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); @@ -451,7 +451,7 @@ _hash_metapinit(Relation rel, double num_tuples) /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); - buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i)); + buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; @@ -468,7 +468,7 @@ _hash_metapinit(Relation rel, double num_tuples) /* * Initialize first bitmap page */ - _hash_initbitmap(rel, metap, num_buckets + 1); + _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); @@ -733,7 +733,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) MemSet(zerobuf, 0, sizeof(zerobuf)); RelationOpenSmgr(rel); - smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, rel->rd_istemp); + smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); return true; } @@ -785,7 +785,7 @@ _hash_splitbucket(Relation rel, oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); nblkno = start_nblkno; - nbuf = _hash_getnewbuf(rel, nblkno); + nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); npage = BufferGetPage(nbuf); /* initialize the new bucket's primary page */ diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c index fd2486a556..93d63b86ea 100644 --- a/src/backend/access/hash/hashscan.c +++ b/src/backend/access/hash/hashscan.c @@ -3,12 +3,12 @@ * hashscan.c * manage scans on hash tables * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.48 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/hash/hashscan.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index b1ccff673e..bf42be103f 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -3,12 +3,12 @@ * hashsearch.c * search code for postgres hash tables * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.59 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/hash/hashsearch.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c index b250e814f9..dbb9c3f39b 100644 --- a/src/backend/access/hash/hashsort.c +++ b/src/backend/access/hash/hashsort.c @@ -14,11 +14,11 @@ * plenty of locality of access. 
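
The hashutil.c hunks later in this patch thread the index column's collation into every call of the hash support function. Consolidated, the lookup-and-call pattern they establish is (a sketch; the wrapper function is illustrative):

    /*
     * Sketch: hash a key with the collation recorded for the index's
     * first (and, for hash, only) column, as _hash_datum2hashkey does
     * below.
     */
    static uint32
    hash_index_key(Relation rel, Datum key)
    {
        FmgrInfo   *procinfo = index_getprocinfo(rel, 1, HASHPROC);
        Oid         collation = rel->rd_indcollation[0];

        return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
    }
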
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.4 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/hash/hashsort.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index d6676702cd..6283f4a82b 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -3,12 +3,12 @@ * hashutil.c * Utility code for Postgres hash implementation. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.62 2010/01/02 16:57:34 momjian Exp $ + * src/backend/access/hash/hashutil.c * *------------------------------------------------------------------------- */ @@ -56,7 +56,8 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup) if (key->sk_flags & SK_ISNULL) return false; - test = FunctionCall2(&key->sk_func, datum, key->sk_argument); + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); if (!DatumGetBool(test)) return false; @@ -79,11 +80,13 @@ uint32 _hash_datum2hashkey(Relation rel, Datum key) { FmgrInfo *procinfo; + Oid collation; /* XXX assumes index has only one attribute */ procinfo = index_getprocinfo(rel, 1, HASHPROC); + collation = rel->rd_indcollation[0]; - return DatumGetUInt32(FunctionCall1(procinfo, key)); + return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); } /* @@ -97,6 +100,7 @@ uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) { RegProcedure hash_proc; + Oid collation; /* XXX assumes index has only one attribute */ hash_proc = get_opfamily_proc(rel->rd_opfamily[0], @@ -107,8 +111,9 @@ _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"", HASHPROC, keytype, keytype, RelationGetRelationName(rel)); + collation = rel->rd_indcollation[0]; - return DatumGetUInt32(OidFunctionCall1(hash_proc, key)); + return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key)); } /* diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index dc33054641..b83d496bcd 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -4,7 +4,7 @@ # Makefile for access/heap # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.19 2008/12/03 13:05:22 heikki Exp $ +# src/backend/access/heap/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 2f9162e52d..f12cad44e5 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.6 2010/04/23 23:21:44 rhaas Exp $ +src/backend/access/heap/README.HOT Heap Only Tuples (HOT) ====================== @@ -246,6 +246,12 @@ scans reasonably sized, the maximum number of line pointers per page is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that could fit without HOT pruning). 
+Effectively, space reclamation happens during tuple retrieval when the +page is nearly full (<10% free) and a buffer cleanup lock can be +acquired. This means that UPDATE, DELETE, and SELECT can trigger space +reclamation, but often not during INSERT ... VALUES because it does +not retrieve a row. + VACUUM ------ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e17b6362c3..01a492e496 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3,12 +3,12 @@ * heapam.c * heap access method code * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.292 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/heap/heapam.c * * * INTERFACE ROUTINES @@ -57,6 +57,7 @@ #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "storage/standby.h" @@ -124,7 +125,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) * * During a rescan, don't make a new strategy object if we don't have to. */ - if (!scan->rs_rd->rd_istemp && + if (!RelationUsesLocalBuffers(scan->rs_rd) && scan->rs_nblocks > NBuffers / 4) { allow_strat = scan->rs_allow_strat; @@ -261,20 +262,20 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) { if (ItemIdIsNormal(lpp)) { + HeapTupleData loctup; bool valid; + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + loctup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(loctup.t_self), page, lineoff); + if (all_visible) valid = true; else - { - HeapTupleData loctup; + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); - loctup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(loctup.t_self), page, lineoff); + CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer); - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - } if (valid) scan->rs_vistuples[ntup++] = lineoff; } @@ -468,12 +469,16 @@ heapgettup(HeapScanDesc scan, snapshot, scan->rs_cbuf); + CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf); + if (valid && key != NULL) HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd), nkeys, key, valid); if (valid) { + if (!scan->rs_relpredicatelocked) + PredicateLockTuple(scan->rs_rd, tuple); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); return; } @@ -741,12 +746,16 @@ heapgettup_pagemode(HeapScanDesc scan, nkeys, key, valid); if (valid) { + if (!scan->rs_relpredicatelocked) + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } } else { + if (!scan->rs_relpredicatelocked) + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } @@ -905,7 +914,7 @@ relation_open(Oid relationId, LOCKMODE lockmode) elog(ERROR, "could not open relation with OID %u", relationId); /* Make note that we've accessed a temporary relation */ - if (r->rd_istemp) + if (RelationUsesLocalBuffers(r)) MyXactAccessedTempRel = true; pgstat_initstats(r); @@ -951,7 +960,7 @@ try_relation_open(Oid relationId, LOCKMODE lockmode) elog(ERROR, "could not open relation with OID %u", relationId); /* Make note that we've accessed a temporary relation */ - if (r->rd_istemp) + if (RelationUsesLocalBuffers(r)) 
MyXactAccessedTempRel = true; pgstat_initstats(r); @@ -1060,7 +1069,8 @@ relation_close(Relation relation, LOCKMODE lockmode) * * This is essentially relation_open plus check that the relation * is not an index nor a composite type. (The caller should also - * check that it's not a view before assuming it has storage.) + * check that it's not a view or foreign table before assuming it has + * storage.) * ---------------- */ Relation @@ -1212,6 +1222,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, scan->rs_strategy = NULL; /* set in initscan */ scan->rs_allow_strat = allow_strat; scan->rs_allow_sync = allow_sync; + scan->rs_relpredicatelocked = false; /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot @@ -1458,6 +1469,11 @@ heap_fetch(Relation relation, */ valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + if (valid) + PredicateLockTuple(relation, tuple); + + CheckForSerializableConflictOut(valid, relation, tuple, buffer); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); if (valid) @@ -1505,13 +1521,14 @@ heap_fetch(Relation relation, * heap_fetch, we do not report any pgstats count; caller may do so if wanted. */ bool -heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, - bool *all_dead) +heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, + Snapshot snapshot, bool *all_dead) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; OffsetNumber offnum; bool at_chain_start; + bool valid; if (all_dead) *all_dead = true; @@ -1551,6 +1568,8 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); heapTuple.t_len = ItemIdGetLength(lp); + heapTuple.t_tableOid = relation->rd_id; + heapTuple.t_self = *tid; /* * Shouldn't see a HEAP_ONLY tuple at chain start. @@ -1568,9 +1587,12 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, break; /* If it's visible per the snapshot, we must return it */ - if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) + valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer); + CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer); + if (valid) { ItemPointerSetOffsetNumber(tid, offnum); + PredicateLockTuple(relation, &heapTuple); if (all_dead) *all_dead = false; return true; @@ -1621,7 +1643,7 @@ heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); - result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); + result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; @@ -1728,6 +1750,7 @@ heap_get_latest_tid(Relation relation, * result candidate. */ valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); + CheckForSerializableConflictOut(valid, relation, &tp, buffer); if (valid) *tid = ctid; @@ -1892,6 +1915,13 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, buffer = RelationGetBufferForTuple(relation, heaptup->t_len, InvalidBuffer, options, bistate); + /* + * We're about to do the actual insert -- check for conflict at the + * relation or buffer level first, to avoid possibly having to roll back + * work we've just done. 
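
The heapam.c hunks above pair the two halves of the serializable (SSI) machinery: every qualifying read takes a predicate lock and reports its visibility verdict through CheckForSerializableConflictOut(), while heap_insert/heap_update/heap_delete call CheckForSerializableConflictIn() before doing work that would otherwise have to be rolled back. The read-side shape, extracted (the helper is illustrative; the calls and their argument order are as in the hunks):

    /*
     * Sketch: the read-side SSI hooks.  The conflict-out check runs
     * whether or not the tuple turned out to be visible; the SIREAD
     * lock is taken only on tuples we actually return.
     */
    static bool
    fetch_with_ssi(Relation rel, HeapTuple tuple, Snapshot snapshot,
                   Buffer buffer)
    {
        bool    valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);

        CheckForSerializableConflictOut(valid, rel, tuple, buffer);

        if (valid)
            PredicateLockTuple(rel, tuple);

        return valid;
    }
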
+ */ + CheckForSerializableConflictIn(relation, NULL, buffer); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -1917,7 +1947,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && !relation->rd_istemp) + if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2173,7 +2203,7 @@ l1: if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated) { - /* Perform additional check for serializable RI updates */ + /* Perform additional check for transaction-snapshot mode RI updates */ if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer)) result = HeapTupleUpdated; } @@ -2192,6 +2222,12 @@ l1: return result; } + /* + * We're about to do the actual delete -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + */ + CheckForSerializableConflictIn(relation, &tp, buffer); + /* replace cid with a combo cid if necessary */ HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); @@ -2227,7 +2263,7 @@ l1: MarkBufferDirty(buffer); /* XLOG stuff */ - if (!relation->rd_istemp) + if (RelationNeedsWAL(relation)) { xl_heap_delete xlrec; XLogRecPtr recptr; @@ -2525,7 +2561,7 @@ l2: if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated) { - /* Perform additional check for serializable RI updates */ + /* Perform additional check for transaction-snapshot mode RI updates */ if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer)) result = HeapTupleUpdated; } @@ -2545,6 +2581,12 @@ l2: return result; } + /* + * We're about to do the actual update -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + */ + CheckForSerializableConflictIn(relation, &oldtup, buffer); + /* Fill in OID and transaction status data for newtup */ if (relation->rd_rel->relhasoids) { @@ -2690,6 +2732,16 @@ l2: } /* + * We're about to create the new tuple -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * NOTE: For a tuple insert, we only need to check for table locks, since + * predicate locking at the index level will cover ranges for anything + * except a table scan. Therefore, only provide the relation. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + + /* * At this point newbuf and buffer are both pinned and locked, and newbuf * has enough space for the new tuple. If they are the same buffer, only * one pin is held. @@ -2780,7 +2832,7 @@ l2: MarkBufferDirty(buffer); /* XLOG stuff */ - if (!relation->rd_istemp) + if (RelationNeedsWAL(relation)) { XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self, newbuf, heaptup, @@ -3403,7 +3455,7 @@ l3: * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG * entries for everything anyway.) */ - if (!relation->rd_istemp) + if (RelationNeedsWAL(relation)) { xl_heap_lock xlrec; XLogRecPtr recptr; @@ -3505,7 +3557,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple) MarkBufferDirty(buffer); /* XLOG stuff */ - if (!relation->rd_istemp) + if (RelationNeedsWAL(relation)) { xl_heap_inplace xlrec; XLogRecPtr recptr; @@ -3776,8 +3828,11 @@ heap_restrpos(HeapScanDesc scan) } /* - * If 'tuple' contains any XID greater than latestRemovedXid, update - * latestRemovedXid to the greatest one found. 
+ * If 'tuple' contains any visible XID greater than latestRemovedXid, + * ratchet forwards latestRemovedXid to the greatest one found. + * This is used as the basis for generating Hot Standby conflicts, so + * if a tuple was never visible then removing it should not conflict + * with queries. */ void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, @@ -3793,13 +3848,25 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, *latestRemovedXid = xvac; } - if (TransactionIdPrecedes(*latestRemovedXid, xmax)) - *latestRemovedXid = xmax; - - if (TransactionIdPrecedes(*latestRemovedXid, xmin)) - *latestRemovedXid = xmin; + /* + * Ignore tuples inserted by an aborted transaction or if the tuple was + * updated/deleted by the inserting transaction. + * + * Look for a committed hint bit, or if no xmin bit is set, check clog. + * This needs to work on both master and standby, where it is used to + * assess btree delete records. + */ + if ((tuple->t_infomask & HEAP_XMIN_COMMITTED) || + (!(tuple->t_infomask & HEAP_XMIN_COMMITTED) && + !(tuple->t_infomask & HEAP_XMIN_INVALID) && + TransactionIdDidCommit(xmin))) + { + if (xmax != xmin && + TransactionIdFollows(xmax, *latestRemovedXid)) + *latestRemovedXid = xmax; + } - Assert(TransactionIdIsValid(*latestRemovedXid)); + /* *latestRemovedXid may still be invalid at end */ } /* @@ -3852,8 +3919,8 @@ log_heap_clean(Relation reln, Buffer buffer, XLogRecPtr recptr; XLogRecData rdata[4]; - /* Caller should not call me on a temp relation */ - Assert(!reln->rd_istemp); + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); @@ -3935,8 +4002,8 @@ log_heap_freeze(Relation reln, Buffer buffer, XLogRecPtr recptr; XLogRecData rdata[2]; - /* Caller should not call me on a temp relation */ - Assert(!reln->rd_istemp); + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); /* nor when there are no tuples to freeze */ Assert(offcnt > 0); @@ -3981,8 +4048,8 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); - /* Caller should not call me on a temp relation */ - Assert(!reln->rd_istemp); + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); if (HeapTupleIsHeapOnly(newtup)) info = XLOG_HEAP_HOT_UPDATE; @@ -4079,8 +4146,15 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); + /* + * The page may be uninitialized. If so, we can't set the LSN and TLI + * because that would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } END_CRIT_SECTION(); @@ -4266,8 +4340,16 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ); memcpy(page, (char *) xlrec + SizeOfHeapNewpage, BLCKSZ); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); + /* + * The page may be uninitialized. If so, we can't set the LSN and TLI + * because that would corrupt the page. 
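
In the HeapTupleHeaderAdvanceLatestRemovedXid() hunk above, the horizon only advances for tuples whose inserting transaction is known committed: trust the hint bit when it is set, treat a known-aborted inserter as irrelevant, and otherwise consult the commit log. The test, restated on its own (illustrative helper; the macros and calls are exactly those in the hunk):

    /* Sketch: is the inserter of this tuple known to have committed? */
    static bool
    tuple_xmin_committed(HeapTupleHeader tuple, TransactionId xmin)
    {
        if (tuple->t_infomask & HEAP_XMIN_COMMITTED)
            return true;                        /* hint bit: committed */
        if (tuple->t_infomask & HEAP_XMIN_INVALID)
            return false;                       /* hint bit: aborted */
        return TransactionIdDidCommit(xmin);    /* fall back to clog */
    }
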
+ */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + } + MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } @@ -4967,7 +5049,7 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) * heap_sync - sync a heap, for use when no WAL has been written * * This forces the heap contents (including TOAST heap if any) down to disk. - * If we skipped using WAL, and it's not a temp relation, we must force the + * If we skipped using WAL, and WAL is otherwise needed, we must force the * relation down to disk before it's safe to commit the transaction. This * requires writing out any dirty buffers and then doing a forced fsync. * @@ -4980,8 +5062,8 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) void heap_sync(Relation rel) { - /* temp tables never need fsync */ - if (rel->rd_istemp) + /* non-WAL-logged tables never need fsync */ + if (!RelationNeedsWAL(rel)) return; /* main heap */ diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 83915ba33d..72a69e52b0 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -3,12 +3,12 @@ * hio.c * POSTGRES heap access method input/output code. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.78 2010/02/09 21:43:29 tgl Exp $ + * src/backend/access/heap/hio.c * *------------------------------------------------------------------------- */ @@ -150,7 +150,7 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock, Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, - struct BulkInsertStateData *bistate) + struct BulkInsertStateData * bistate) { bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); Buffer buffer = InvalidBuffer; diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 3332e085b8..0cfa866108 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -3,12 +3,12 @@ * pruneheap.c * heap page pruning and HOT-chain management code * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.25 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/heap/pruneheap.c * *------------------------------------------------------------------------- */ @@ -233,11 +233,10 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, /* * Emit a WAL HEAP_CLEAN record showing what we did */ - if (!relation->rd_istemp) + if (RelationNeedsWAL(relation)) { XLogRecPtr recptr; - Assert(TransactionIdIsValid(prstate.latestRemovedXid)); recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index e35fbeb9b1..e56140950a 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -92,11 +92,11 @@ * heap's TOAST table will go through the normal bufmgr. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.22 2010/04/28 16:10:40 heikki Exp $ + * src/backend/access/heap/rewriteheap.c * *------------------------------------------------------------------------- */ @@ -131,7 +131,7 @@ typedef struct RewriteStateData * them */ HTAB *rs_unresolved_tups; /* unmatched A tuples */ HTAB *rs_old_new_tid_map; /* unmatched B tuples */ -} RewriteStateData; +} RewriteStateData; /* * The lookup keys for the hash tables are tuple TID and xmin (we must check @@ -254,8 +254,6 @@ end_heap_rewrite(RewriteState state) /* * Write any remaining tuples in the UnresolvedTups table. If we have any * left, they should in fact be dead, but let's err on the safe side. - * - * XXX this really is a waste of code no? */ hash_seq_init(&seq_status, state->rs_unresolved_tups); @@ -279,8 +277,8 @@ end_heap_rewrite(RewriteState state) } /* - * If the rel isn't temp, must fsync before commit. We use heap_sync to - * ensure that the toast table gets fsync'd too. + * If the rel is WAL-logged, must fsync before commit. We use heap_sync + * to ensure that the toast table gets fsync'd too. * * It's obvious that we must do this when not WAL-logging. It's less * obvious that we have to do it even if we did WAL-log the pages. The @@ -289,7 +287,7 @@ end_heap_rewrite(RewriteState state) * occurring during the rewriteheap operation won't have fsync'd data we * wrote before the checkpoint. */ - if (!state->rs_new_rel->rd_istemp) + if (RelationNeedsWAL(state->rs_new_rel)) heap_sync(state->rs_new_rel); /* Deleting the context frees everything */ @@ -502,8 +500,12 @@ rewrite_heap_tuple(RewriteState state, * Register a dead tuple with an ongoing rewrite. Dead tuples are not * copied to the new table, but we still make note of them so that we * can release some resources earlier. + * + * Returns true if a tuple was removed from the unresolved_tups table. + * This indicates that the tuple, previously thought to be "recently dead", + * is now known to be really dead and won't be written to the output.
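A plausible caller-side use of the new boolean result, assuming the CLUSTER/VACUUM FULL rewrite loop keeps per-category tuple counts (the counter names are hypothetical):

    if (rewrite_heap_dead_tuple(rwstate, tuple))
    {
        /* A tuple once counted as recently dead is in fact plain dead. */
        tups_vacuumed += 1;
        tups_recently_dead -= 1;
    }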
*/ -void +bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) { /* @@ -539,7 +541,10 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) hash_search(state->rs_unresolved_tups, &hashkey, HASH_REMOVE, &found); Assert(found); + return true; } + + return false; } /* @@ -636,7 +641,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) } /* And now we can insert the tuple into the page */ - newoff = PageAddItem(page, (Item) heaptup->t_data, len, + newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); diff --git a/src/backend/access/heap/syncscan.c b/src/backend/access/heap/syncscan.c index 8dc2bb476d..957d1a12f3 100644 --- a/src/backend/access/heap/syncscan.c +++ b/src/backend/access/heap/syncscan.c @@ -36,11 +36,11 @@ * ss_report_location - update current scan location * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/syncscan.c,v 1.7 2010/01/02 16:57:35 momjian Exp $ + * src/backend/access/heap/syncscan.c * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 7518db16c8..4f4dd69291 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -4,11 +4,11 @@ * Support routines for external and compressed storage of * variable size attributes. * - * Copyright (c) 2000-2010, PostgreSQL Global Development Group + * Copyright (c) 2000-2011, PostgreSQL Global Development Group * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.98 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/heap/tuptoaster.c * * * INTERFACE ROUTINES diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 88a0c74e32..58bab7df10 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -3,12 +3,12 @@ * visibilitymap.c * bitmap for tracking visibility of heap tuples * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.10 2010/04/23 23:21:44 rhaas Exp $ + * src/backend/access/heap/visibilitymap.c * * INTERFACE ROUTINES * visibilitymap_clear - clear a bit in the visibility map @@ -373,8 +373,7 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) } /* Truncate the unused VM pages, and send smgr inval message */ - smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks, - rel->rd_istemp); + smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks); /* * We might as well update the local smgr_vm_nblocks setting. 
smgrtruncate @@ -478,7 +477,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) while (vm_nblocks_now < vm_nblocks) { smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, - (char *) pg, rel->rd_istemp); + (char *) pg, false); vm_nblocks_now++; } diff --git a/src/backend/access/index/Makefile b/src/backend/access/index/Makefile index 6357a43f89..96490db032 100644 --- a/src/backend/access/index/Makefile +++ b/src/backend/access/index/Makefile @@ -4,7 +4,7 @@ # Makefile for access/index # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/index/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $ +# src/backend/access/index/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index d95fd90a42..db04e26a65 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -3,12 +3,12 @@ * genam.c * general index access method routines * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.81 2010/02/26 02:00:33 momjian Exp $ + * src/backend/access/index/genam.c * * NOTES * many of the old access method routines have been turned into @@ -57,22 +57,20 @@ /* ---------------- * RelationGetIndexScan -- Create and fill an IndexScanDesc. * - * This routine creates an index scan structure and sets its contents - * up correctly. This routine calls AMrescan to set up the scan with - * the passed key. + * This routine creates an index scan structure and sets up initial + * contents for it. * * Parameters: * indexRelation -- index relation for scan. - * nkeys -- count of scan keys. - * key -- array of scan keys to restrict the index scan. + * nkeys -- count of scan keys (index qual conditions). + * norderbys -- count of index order-by operators. * * Returns: * An initialized IndexScanDesc. * ---------------- */ IndexScanDesc -RelationGetIndexScan(Relation indexRelation, - int nkeys, ScanKey key) +RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) { IndexScanDesc scan; @@ -82,15 +80,19 @@ RelationGetIndexScan(Relation indexRelation, scan->indexRelation = indexRelation; scan->xs_snapshot = SnapshotNow; /* may be set later */ scan->numberOfKeys = nkeys; + scan->numberOfOrderBys = norderbys; /* - * We allocate the key space here, but the AM is responsible for actually - * filling it from the passed key array. + * We allocate key workspace here, but it won't get filled until amrescan. */ if (nkeys > 0) scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); else scan->keyData = NULL; + if (norderbys > 0) + scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys); + else + scan->orderByData = NULL; /* * During recovery we ignore killed tuples and don't bother to kill them @@ -115,11 +117,6 @@ RelationGetIndexScan(Relation indexRelation, scan->xs_next_hot = InvalidOffsetNumber; scan->xs_prev_xmax = InvalidTransactionId; - /* - * Let the AM fill in the key and any opaque data it wants. 
- */ - index_rescan(scan, key); - return scan; } @@ -140,6 +137,8 @@ IndexScanEnd(IndexScanDesc scan) { if (scan->keyData != NULL) pfree(scan->keyData); + if (scan->orderByData != NULL) + pfree(scan->orderByData); pfree(scan); } @@ -286,7 +285,8 @@ systable_beginscan(Relation heapRelation, } sysscan->iscan = index_beginscan(heapRelation, irel, - snapshot, nkeys, key); + snapshot, nkeys, 0); + index_rescan(sysscan->iscan, key, nkeys, NULL, 0); sysscan->scan = NULL; } else @@ -450,7 +450,8 @@ systable_beginscan_ordered(Relation heapRelation, } sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, nkeys, key); + snapshot, nkeys, 0); + index_rescan(sysscan->iscan, key, nkeys, NULL, 0); sysscan->scan = NULL; return sysscan; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 3e7331ae7b..0208765964 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -3,12 +3,12 @@ * indexam.c * general index access method routines * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.118 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/index/indexam.c * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -64,9 +64,12 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/xact.h" +#include "catalog/index.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/tqual.h" @@ -74,12 +77,21 @@ /* ---------------------------------------------------------------- * macros used in index_ routines + * + * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there + * to check that we don't try to scan or do retail insertions into an index + * that is currently being rebuilt or pending rebuild. This helps to catch + * things that don't work when reindexing system catalogs. The assertion + * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS + * when calling the index AM's ambuild routine, and there is no reason for + * ambuild to call its subsidiary routines through this file. * ---------------------------------------------------------------- */ #define RELATION_CHECKS \ ( \ AssertMacro(RelationIsValid(indexRelation)), \ - AssertMacro(PointerIsValid(indexRelation->rd_am)) \ + AssertMacro(PointerIsValid(indexRelation->rd_am)), \ + AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \ ) #define SCAN_CHECKS \ @@ -114,7 +126,7 @@ do { \ } while(0) static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, ScanKey key); + int nkeys, int norderbys); /* ---------------------------------------------------------------- @@ -192,6 +204,11 @@ index_insert(Relation indexRelation, RELATION_CHECKS; GET_REL_PROCEDURE(aminsert); + if (!(indexRelation->rd_am->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (HeapTuple) NULL, + InvalidBuffer); + /* * have the am's insert proc do all the work. 
*/ @@ -213,11 +230,11 @@ IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, - int nkeys, ScanKey key) + int nkeys, int norderbys) { IndexScanDesc scan; - scan = index_beginscan_internal(indexRelation, nkeys, key); + scan = index_beginscan_internal(indexRelation, nkeys, norderbys); /* * Save additional parameters into the scandesc. Everything else was set @@ -238,11 +255,11 @@ index_beginscan(Relation heapRelation, IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, - int nkeys, ScanKey key) + int nkeys) { IndexScanDesc scan; - scan = index_beginscan_internal(indexRelation, nkeys, key); + scan = index_beginscan_internal(indexRelation, nkeys, 0); /* * Save additional parameters into the scandesc. Everything else was set @@ -258,7 +275,7 @@ index_beginscan_bitmap(Relation indexRelation, */ static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, ScanKey key) + int nkeys, int norderbys) { IndexScanDesc scan; FmgrInfo *procedure; @@ -266,6 +283,9 @@ index_beginscan_internal(Relation indexRelation, RELATION_CHECKS; GET_REL_PROCEDURE(ambeginscan); + if (!(indexRelation->rd_am->ampredlocks)) + PredicateLockRelation(indexRelation); + /* * We hold a reference count to the relcache entry throughout the scan. */ @@ -278,7 +298,7 @@ index_beginscan_internal(Relation indexRelation, DatumGetPointer(FunctionCall3(procedure, PointerGetDatum(indexRelation), Int32GetDatum(nkeys), - PointerGetDatum(key))); + Int32GetDatum(norderbys))); return scan; } @@ -286,23 +306,28 @@ index_beginscan_internal(Relation indexRelation, /* ---------------- * index_rescan - (re)start a scan of an index * - * The caller may specify a new set of scankeys (but the number of keys - * cannot change). To restart the scan without changing keys, pass NULL - * for the key array. - * - * Note that this is also called when first starting an indexscan; - * see RelationGetIndexScan. Keys *must* be passed in that case, - * unless scan->numberOfKeys is zero. + * During a restart, the caller may specify a new set of scankeys and/or + * orderbykeys; but the number of keys cannot differ from what index_beginscan + * was told. (Later we might relax that to "must not exceed", but currently + * the index AMs tend to assume that scan->numberOfKeys is what to believe.) + * To restart the scan without changing keys, pass NULL for the key arrays. + * (Of course, keys *must* be passed on the first call, unless + * scan->numberOfKeys is zero.) 
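Under the revised AM API, key binding is split out of scan creation; callers follow the same two-step pattern the systable_beginscan() hunks above adopt. A hedged usage sketch (relation, snapshot, and key setup omitted):

    IndexScanDesc scan;
    HeapTuple   tup;

    scan = index_beginscan(heapRel, indexRel, snapshot, nkeys, 0);
    index_rescan(scan, keys, nkeys, NULL, 0);       /* bind the scan keys */
    while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* process each heap tuple visible under the snapshot */
    }
    index_endscan(scan);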
* ---------------- */ void -index_rescan(IndexScanDesc scan, ScanKey key) +index_rescan(IndexScanDesc scan, + ScanKey keys, int nkeys, + ScanKey orderbys, int norderbys) { FmgrInfo *procedure; SCAN_CHECKS; GET_SCAN_PROCEDURE(amrescan); + Assert(nkeys == scan->numberOfKeys); + Assert(norderbys == scan->numberOfOrderBys); + /* Release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { @@ -314,9 +339,12 @@ index_rescan(IndexScanDesc scan, ScanKey key) scan->kill_prior_tuple = false; /* for safety */ - FunctionCall2(procedure, + FunctionCall5(procedure, PointerGetDatum(scan), - PointerGetDatum(key)); + PointerGetDatum(keys), + Int32GetDatum(nkeys), + PointerGetDatum(orderbys), + Int32GetDatum(norderbys)); } /* ---------------- @@ -515,6 +543,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) { ItemId lp; ItemPointer ctid; + bool valid; /* check for bogus TID */ if (offnum < FirstOffsetNumber || @@ -569,8 +598,13 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) break; /* If it's visible per the snapshot, we must return it */ - if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, - scan->xs_cbuf)) + valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, + scan->xs_cbuf); + + CheckForSerializableConflictOut(valid, scan->heapRelation, + heapTuple, scan->xs_cbuf); + + if (valid) { /* * If the snapshot is MVCC, we know that it could accept at @@ -590,6 +624,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) else scan->xs_next_hot = InvalidOffsetNumber; + PredicateLockTuple(scan->heapRelation, heapTuple); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); pgstat_count_heap_fetch(scan->indexRelation); diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index 676ae3c270..2d76d648e0 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -4,7 +4,7 @@ # Makefile for access/nbtree # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/nbtree/Makefile,v 1.16 2008/02/19 10:30:06 petere Exp $ +# src/backend/access/nbtree/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 57d6308ada..561ffbb9d4 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.22 2010/02/08 04:33:53 tgl Exp $ +src/backend/access/nbtree/README Btree Indexing ============== @@ -154,7 +154,7 @@ even pages that don't contain any deletable tuples. This guarantees that the btbulkdelete call cannot return while any indexscan is still holding a copy of a deleted index tuple. Note that this requirement does not say that btbulkdelete must visit the pages in any particular order. (See also -on-the-fly deletion, below.) +on-the-fly deletion, below.) There is no such interlocking for deletion of items in internal pages, since backends keep no lock nor pin on a page they have descended past. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 13c127a93c..23f2b61fe9 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -3,12 +3,12 @@ * nbtcompare.c * Comparison functions for btree access method. 
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.58 2010/01/02 16:57:35 momjian Exp $ + * src/backend/access/nbtree/nbtcompare.c * * NOTES * diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index cd70a4c73e..d758659c31 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -3,12 +3,12 @@ * nbtinsert.c * Item insertion in Lehman and Yao btrees for Postgres. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.178 2010/03/28 09:27:01 sriggs Exp $ + * src/backend/access/nbtree/nbtinsert.c * *------------------------------------------------------------------------- */ @@ -21,6 +21,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "utils/inval.h" #include "utils/tqual.h" @@ -74,9 +75,8 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page, static void _bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int dataitemstoleft, Size firstoldonrightsz); -static void _bt_pgaddtup(Relation rel, Page page, - Size itemsize, IndexTuple itup, - OffsetNumber itup_off, const char *where); +static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, + OffsetNumber itup_off); static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); @@ -175,6 +175,14 @@ top: if (checkUnique != UNIQUE_CHECK_EXISTING) { + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. Since the + * actual location of the insert is hard to predict because of the + * random search used to prevent O(N^2) performance when there are + * many duplicate entries, we can just use the "first valid" page. + */ + CheckForSerializableConflictIn(rel, NULL, buf); /* do the insertion */ _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); _bt_insertonpg(rel, buf, stack, itup, offset, false); @@ -697,6 +705,9 @@ _bt_insertonpg(Relation rel, /* split the buffer into left and right halves */ rbuf = _bt_split(rel, buf, firstright, newitemoff, itemsz, itup, newitemonleft); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); /*---------- * By here, @@ -753,7 +764,9 @@ _bt_insertonpg(Relation rel, /* Do the update. 
No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page"); + if (!_bt_pgaddtup(page, itemsz, itup, newitemoff)) + elog(PANIC, "failed to add new item to block %u in index \"%s\"", + itup_blkno, RelationGetRelationName(rel)); MarkBufferDirty(buf); @@ -765,7 +778,7 @@ _bt_insertonpg(Relation rel, } /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; BlockNumber xldownlink; @@ -879,6 +892,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, Page origpage; Page leftpage, rightpage; + BlockNumber origpagenumber, + rightpagenumber; BTPageOpaque ropaque, lopaque, oopaque; @@ -894,11 +909,27 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber i; bool isroot; + /* Acquire a new page to split into */ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + + /* + * origpage is the original page to be split. leftpage is a temporary + * buffer that receives the left-sibling data, which will be copied back + * into origpage on success. rightpage is the new page that receives the + * right-sibling data. If we fail before reaching the critical section, + * origpage hasn't been modified and leftpage is only workspace. In + * principle we shouldn't need to worry about rightpage either, because it + * hasn't been linked into the btree page structure; but to avoid leaving + * possibly-confusing junk behind, we are careful to rewrite rightpage as + * zeroes before throwing any error. + */ origpage = BufferGetPage(buf); leftpage = PageGetTempPage(origpage); rightpage = BufferGetPage(rbuf); + origpagenumber = BufferGetBlockNumber(buf); + rightpagenumber = BufferGetBlockNumber(rbuf); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); /* rightpage was already initialized by _bt_getbuf */ @@ -923,8 +954,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_flags = lopaque->btpo_flags; lopaque->btpo_prev = oopaque->btpo_prev; - lopaque->btpo_next = BufferGetBlockNumber(rbuf); - ropaque->btpo_prev = BufferGetBlockNumber(buf); + lopaque->btpo_next = rightpagenumber; + ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; /* Since we already have write-lock on both pages, ok to read cycleid */ @@ -947,9 +978,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, item = (IndexTuple) PageGetItem(origpage, itemid); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, false, false) == InvalidOffsetNumber) - elog(PANIC, "failed to add hikey to the right sibling" + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the right sibling" " while splitting block %u of index \"%s\"", - BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + origpagenumber, RelationGetRelationName(rel)); + } rightoff = OffsetNumberNext(rightoff); } @@ -974,9 +1008,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, } if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, false, false) == InvalidOffsetNumber) - elog(PANIC, "failed to add hikey to the left sibling" + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the left sibling" " while splitting block %u of index \"%s\"", - BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + origpagenumber, RelationGetRelationName(rel)); + } leftoff = 
OffsetNumberNext(leftoff); /* @@ -998,14 +1035,24 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, { if (newitemonleft) { - _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff, - "left sibling"); + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } leftoff = OffsetNumberNext(leftoff); } else { - _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, - "right sibling"); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } rightoff = OffsetNumberNext(rightoff); } } @@ -1013,14 +1060,24 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, /* decide which page to put it on */ if (i < firstright) { - _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff, - "left sibling"); + if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } leftoff = OffsetNumberNext(leftoff); } else { - _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff, - "right sibling"); + if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } rightoff = OffsetNumberNext(rightoff); } } @@ -1034,8 +1091,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, * not be splitting the page). */ Assert(!newitemonleft); - _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, - "right sibling"); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } rightoff = OffsetNumberNext(rightoff); } @@ -1047,16 +1109,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, * neighbors. */ - if (!P_RIGHTMOST(ropaque)) + if (!P_RIGHTMOST(oopaque)) { - sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); spage = BufferGetPage(sbuf); sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); - if (sopaque->btpo_prev != ropaque->btpo_prev) - elog(PANIC, "right sibling's left-link doesn't match: " + if (sopaque->btpo_prev != origpagenumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", - ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev, + oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, RelationGetRelationName(rel)); + } /* * Check to see if we can set the SPLIT_END flag in the right-hand @@ -1081,8 +1146,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, * * NO EREPORT(ERROR) till right sibling is updated. 
We can get away with * not starting the critical section till here because we haven't been - * scribbling on the original page yet, and we don't care about the new - * sibling until it's linked into the btree. + * scribbling on the original page yet; see comments above. */ START_CRIT_SECTION(); @@ -1094,24 +1158,26 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, * (in the page management code) that the center of a page always be * clean, and the most efficient way to guarantee this is just to compact * the data by reinserting it into a new left page. (XXX the latter - * comment is probably obsolete.) + * comment is probably obsolete; but in any case it's good to not scribble + * on the original page until we enter the critical section.) * * We need to do this before writing the WAL record, so that XLogInsert * can WAL log an image of the page if necessary. */ PageRestoreTempPage(leftpage, origpage); + /* leftpage, lopaque must not be used below here */ MarkBufferDirty(buf); MarkBufferDirty(rbuf); if (!P_RIGHTMOST(ropaque)) { - sopaque->btpo_prev = BufferGetBlockNumber(rbuf); + sopaque->btpo_prev = rightpagenumber; MarkBufferDirty(sbuf); } /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { xl_btree_split xlrec; uint8 xlinfo; @@ -1120,8 +1186,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, XLogRecData *lastrdata; xlrec.node = rel->rd_node; - xlrec.leftsib = BufferGetBlockNumber(buf); - xlrec.rightsib = BufferGetBlockNumber(rbuf); + xlrec.leftsib = origpagenumber; + xlrec.rightsib = rightpagenumber; xlrec.rnext = ropaque->btpo_next; xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; @@ -1860,7 +1926,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) MarkBufferDirty(metabuf); /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; XLogRecPtr recptr; @@ -1920,13 +1986,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * we insert the tuples in order, so that the given itup_off does * represent the final position of the tuple! */ -static void -_bt_pgaddtup(Relation rel, - Page page, +static bool +_bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, - OffsetNumber itup_off, - const char *where) + OffsetNumber itup_off) { BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTupleData trunctuple; @@ -1941,8 +2005,9 @@ _bt_pgaddtup(Relation rel, if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) - elog(PANIC, "failed to add item to the %s in index \"%s\"", - where, RelationGetRelationName(rel)); + return false; + + return true; } /* @@ -1978,9 +2043,10 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, if (isNull || (scankey->sk_flags & SK_ISNULL)) return false; - result = DatumGetInt32(FunctionCall2(&scankey->sk_func, - datum, - scankey->sk_argument)); + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); if (result != 0) return false; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 5e9e7b9270..58f47e7b64 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -4,12 +4,12 @@ * BTree-specific page management code for the Postgres btree access * method. 
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.123 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/nbtree/nbtpage.c * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -29,6 +29,7 @@ #include "storage/freespace.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "utils/inval.h" #include "utils/snapmgr.h" @@ -224,7 +225,7 @@ _bt_getroot(Relation rel, int access) MarkBufferDirty(metabuf); /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; XLogRecPtr recptr; @@ -452,7 +453,7 @@ _bt_checkpage(Relation rel, Buffer buf) static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) { - if (rel->rd_istemp) + if (!RelationNeedsWAL(rel)) return; /* No ereport(ERROR) until changes are logged */ @@ -465,7 +466,6 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX /* XLOG stuff */ { - XLogRecPtr recptr; XLogRecData rdata[1]; xl_btree_reuse_page xlrec_reuse; @@ -477,7 +477,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX rdata[0].buffer = InvalidBuffer; rdata[0].next = NULL; - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata); + XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata); /* * We don't do PageSetLSN or PageSetTLI here because we're about @@ -751,7 +751,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, MarkBufferDirty(buf); /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogRecData rdata[2]; @@ -829,7 +829,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, MarkBufferDirty(buf); /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogRecData rdata[3]; @@ -1175,6 +1175,19 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack) */ rightsib = opaque->btpo_next; rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_prev != target) + elog(ERROR, "right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + rightsib, opaque->btpo_prev, target, + RelationGetRelationName(rel)); + + /* + * Any insert which would have gone on the target block will now go to the + * right sibling block. + */ + PredicateLockPageCombine(rel, target, rightsib); /* * Next find and write-lock the current parent of the target page. This is @@ -1253,6 +1266,38 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack) } /* + * Check that the parent-page index items we're about to delete/overwrite + * contain what we expect. This can fail if the index has become corrupt + * for some reason. We want to throw any error before entering the + * critical section --- otherwise it'd be a PANIC. + * + * The test on the target item is just an Assert because _bt_getstackbuf + * should have guaranteed it has the expected contents. The test on the + * next-child downlink is known to sometimes fail in the field, though. 
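The reshuffling here follows the rule the comment states: anything that can legitimately fail must be checked before START_CRIT_SECTION(), since inside a critical section any ERROR is escalated to PANIC. Schematically (a sketch only; the check helper is hypothetical):

    /* 1. Validate whatever might be corrupt -- a plain ERROR is still OK. */
    if (!downlink_points_to(page, nextoffset, rightsib))
        elog(ERROR, "unexpected downlink in index \"%s\"",
             RelationGetRelationName(rel));

    /* 2. Only then mutate pages and emit WAL as one atomic action. */
    START_CRIT_SECTION();
    /* ... PageIndexTupleDelete(), MarkBufferDirty(), XLogInsert() ... */
    END_CRIT_SECTION();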
+ */ + page = BufferGetPage(pbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + +#ifdef USE_ASSERT_CHECKING + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); +#endif + + if (!parent_half_dead) + { + OffsetNumber nextoffset; + + nextoffset = OffsetNumberNext(poffset); + itemid = PageGetItemId(page, nextoffset); + itup = (IndexTuple) PageGetItem(page, itemid); + if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) + elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"", + rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)), + parent, RelationGetRelationName(rel)); + } + + /* * Here we begin doing the deletion. */ @@ -1265,8 +1310,6 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack) * to copy the right sibling's downlink over the target downlink, and then * delete the following item. */ - page = BufferGetPage(pbuf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (parent_half_dead) { PageIndexTupleDelete(page, poffset); @@ -1278,23 +1321,16 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack) itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); - Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); - /* This part is just for double-checking */ - itemid = PageGetItemId(page, nextoffset); - itup = (IndexTuple) PageGetItem(page, itemid); - if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) - elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"", - rightsib, target, BufferGetBlockNumber(pbuf), - RelationGetRelationName(rel)); PageIndexTupleDelete(page, nextoffset); } /* * Update siblings' side-links. Note the target page's side-links will - * continue to point to the siblings. + * continue to point to the siblings. Asserts here are just rechecking + * things we already verified above. */ if (BufferIsValid(lbuf)) { @@ -1335,7 +1371,7 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack) MarkBufferDirty(lbuf); /* XLOG stuff */ - if (!rel->rd_istemp) + if (RelationNeedsWAL(rel)) { xl_btree_delete_page xlrec; xl_btree_metadata xlmeta; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 0fcde95ccd..6a7ddd7db4 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -8,11 +8,11 @@ * This file contains only the public interface routines. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.177 2010/03/28 09:27:01 sriggs Exp $ + * src/backend/access/nbtree/nbtree.c * *------------------------------------------------------------------------- */ @@ -29,6 +29,9 @@ #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" #include "utils/memutils.h" @@ -205,6 +208,36 @@ btbuildCallback(Relation index, } /* + * btbuildempty() -- build an empty btree index in the initialization fork + */ +Datum +btbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Page metapage; + + /* Construct metapage. 
*/ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, P_NONE, 0); + + /* Write the page. If archiving/streaming, XLOG it. */ + smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, + (char *) metapage, true); + if (XLogIsNeeded()) + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BTREE_METAPAGE, metapage); + + /* + * An immediate sync is required even if we xlog'd the page, because the + * write did not go through shared_buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); + + PG_RETURN_VOID(); +} + +/* * btinsert() -- insert an index tuple into a btree. * * Descend the tree recursively, find the appropriate location for our @@ -337,12 +370,27 @@ Datum btbeginscan(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - int keysz = PG_GETARG_INT32(1); - ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(1); + int norderbys = PG_GETARG_INT32(2); IndexScanDesc scan; + BTScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); /* get the scan */ - scan = RelationGetIndexScan(rel, keysz, scankey); + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + /* allocate private workspace */ + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so->currPos.buf = so->markPos.buf = InvalidBuffer; + if (scan->numberOfKeys > 0) + so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + else + so->keyData = NULL; + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + scan->opaque = so; PG_RETURN_POINTER(scan); } @@ -355,22 +403,9 @@ btrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); - BTScanOpaque so; - so = (BTScanOpaque) scan->opaque; - - if (so == NULL) /* if called from btbeginscan */ - { - so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - so->currPos.buf = so->markPos.buf = InvalidBuffer; - if (scan->numberOfKeys > 0) - so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); - else - so->keyData = NULL; - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; - scan->opaque = so; - } + /* remaining arguments are ignored */ + BTScanOpaque so = (BTScanOpaque) scan->opaque; /* we aren't holding any read locks, but gotta drop the pins */ if (BTScanPosIsValid(so->currPos)) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fc11829495..2ce2bc2f00 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -4,11 +4,11 @@ * Search code for postgres btrees. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.121 2010/01/02 16:57:35 momjian Exp $ + * src/backend/access/nbtree/nbtsearch.c * *------------------------------------------------------------------------- */ @@ -21,6 +21,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -63,7 +64,10 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, /* If index is empty and access = BT_READ, no root page is created.
*/ if (!BufferIsValid(*bufP)) + { + PredicateLockRelation(rel); /* Nothing finer to lock exists. */ return (BTStack) NULL; + } /* Loop iterates once per level descended in the tree */ for (;;) @@ -88,7 +92,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) + { + if (access == BT_READ) + PredicateLockPage(rel, BufferGetBlockNumber(*bufP)); break; + } /* * Find the appropriate item on the internal page, and get the child @@ -402,9 +410,10 @@ _bt_compare(Relation rel, * to flip the sign of the comparison result. (Unless it's a DESC * column, in which case we *don't* flip the sign.) */ - result = DatumGetInt32(FunctionCall2(&scankey->sk_func, - datum, - scankey->sk_argument)); + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); if (!(scankey->sk_flags & SK_BT_DESC)) result = -result; @@ -713,6 +722,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) cur->sk_attno, InvalidStrategy, cur->sk_subtype, + cur->sk_collation, procinfo, cur->sk_argument); } @@ -733,6 +743,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) cur->sk_attno, InvalidStrategy, cur->sk_subtype, + cur->sk_collation, cmp_proc, cur->sk_argument); } @@ -1142,6 +1153,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, blkno); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) @@ -1189,6 +1201,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf)); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) @@ -1352,6 +1365,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. */ return InvalidBuffer; } @@ -1431,10 +1445,12 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. */ so->currPos.buf = InvalidBuffer; return false; } + PredicateLockPage(rel, BufferGetBlockNumber(buf)); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISLEAF(opaque)); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 15964e127e..93a928c66b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -55,11 +55,11 @@ * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.125 2010/04/28 16:10:40 heikki Exp $ + * src/backend/access/nbtree/nbtsort.c * *------------------------------------------------------------------------- */ @@ -70,6 +70,7 @@ #include "access/nbtree.h" #include "miscadmin.h" #include "storage/smgr.h" +#include "tcop/tcopprot.h" #include "utils/rel.h" #include "utils/tuplesort.h" @@ -211,9 +212,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) /* * We need to log index creation in WAL iff WAL archiving/streaming is - * enabled AND it's not a temp index. + * enabled AND the index is WAL-logged. */ - wstate.btws_use_wal = XLogIsNeeded() && !wstate.index->rd_istemp; + wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; @@ -295,9 +296,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) } /* - * Now write the page. We say isTemp = true even if it's not a temp - * index, because there's no need for smgr to schedule an fsync for this - * write; we'll do it ourselves before ending the build. + * Now write the page. There's no need for smgr to schedule an fsync for + * this write; we'll do it ourselves before ending the build. */ if (blkno == wstate->btws_pages_written) { @@ -736,9 +736,11 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } else { - compare = DatumGetInt32(FunctionCall2(&entry->sk_func, - attrDatum1, - attrDatum2)); + compare = + DatumGetInt32(FunctionCall2Coll(&entry->sk_func, + entry->sk_collation, + attrDatum1, + attrDatum2)); if (entry->sk_flags & SK_BT_DESC) compare = -compare; @@ -798,9 +800,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * If the index isn't temp, we must fsync it down to disk before it's safe - * to commit the transaction. (For a temp index we don't care since the - * index will be uninteresting after a crash anyway.) + * If the index is WAL-logged, we must fsync it down to disk before it's + * safe to commit the transaction. (For a non-WAL-logged index we don't + * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index @@ -812,7 +814,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) * fsync those pages here, they might still not be on disk when the crash * occurs. */ - if (!wstate->index->rd_istemp) + if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 6b399d34a6..2e896a258f 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3,12 +3,12 @@ * nbtutils.c * Utility code for Postgres btree implementation.
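The FunctionCall2 -> FunctionCall2Coll substitutions, here and in nbtutils.c below, thread a per-column collation OID through to the btree support function; the calling pattern, with placeholder datums:

    int32 cmp = DatumGetInt32(FunctionCall2Coll(&skey->sk_func,
                                                skey->sk_collation,
                                                datumA,
                                                datumB));

    if (skey->sk_flags & SK_BT_DESC)
        cmp = -cmp;             /* DESC columns flip the result, as above */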
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.98 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/nbtree/nbtutils.c * *------------------------------------------------------------------------- */ @@ -80,6 +80,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup) (AttrNumber) (i + 1), InvalidStrategy, InvalidOid, + rel->rd_indcollation[i], procinfo, arg); } @@ -127,6 +128,7 @@ _bt_mkscankey_nodata(Relation rel) (AttrNumber) (i + 1), InvalidStrategy, InvalidOid, + rel->rd_indcollation[i], procinfo, (Datum) 0); } @@ -600,9 +602,10 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, */ if (lefttype == opcintype && righttype == optype) { - *result = DatumGetBool(FunctionCall2(&op->sk_func, - leftarg->sk_argument, - rightarg->sk_argument)); + *result = DatumGetBool(FunctionCall2Coll(&op->sk_func, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); return true; } @@ -629,9 +632,10 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, if (RegProcedureIsValid(cmp_proc)) { - *result = DatumGetBool(OidFunctionCall2(cmp_proc, - leftarg->sk_argument, - rightarg->sk_argument)); + *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); return true; } } @@ -685,6 +689,10 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) * Likewise, "x IS NOT NULL" is supported. We treat that as either "less * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS * FIRST index. + * + * Note: someday we might have to fill in sk_collation from the index + * column's collation. At the moment this is a non-issue because we'll + * never actually call the comparison operator on a NULL. */ if (skey->sk_flags & SK_ISNULL) { @@ -699,6 +707,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) { skey->sk_strategy = BTEqualStrategyNumber; skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; } else if (skey->sk_flags & SK_SEARCHNOTNULL) { @@ -707,6 +716,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) else skey->sk_strategy = BTLessStrategyNumber; skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; } else { @@ -972,7 +982,8 @@ _bt_checkkeys(IndexScanDesc scan, return false; } - test = FunctionCall2(&key->sk_func, datum, key->sk_argument); + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); if (!DatumGetBool(test)) { @@ -1095,9 +1106,10 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, } /* Perform the test --- three-way comparison not bool operator */ - cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func, - datum, - subkey->sk_argument)); + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + datum, + subkey->sk_argument)); if (subkey->sk_flags & SK_BT_DESC) cmpresult = -cmpresult; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 3261483475..2775ae6d29 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -4,11 +4,11 @@ * WAL replay logic for btrees. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.69 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/nbtree/nbtxlog.c * *------------------------------------------------------------------------- */ @@ -580,7 +580,6 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record) BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; - TransactionId htupxid = InvalidTransactionId; int i; /* @@ -646,24 +645,16 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record) } /* - * If the heap item has storage, then read the header. Some LP_DEAD - * items may not be accessible, so we ignore them. + * If the heap item has storage, then read the header and use that to + * set latestRemovedXid. + * + * Some LP_DEAD items may not be accessible, so we ignore them. */ if (ItemIdHasStorage(hitemid)) { htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); - /* - * Get the heap tuple's xmin/xmax and ratchet up the - * latestRemovedXid. No need to consider xvac values here. - */ - htupxid = HeapTupleHeaderGetXmin(htuphdr); - if (TransactionIdFollows(htupxid, latestRemovedXid)) - latestRemovedXid = htupxid; - - htupxid = HeapTupleHeaderGetXmax(htuphdr); - if (TransactionIdFollows(htupxid, latestRemovedXid)) - latestRemovedXid = htupxid; + HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); } else if (ItemIdIsDead(hitemid)) { diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index fe34e4eaaa..849ab384d5 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -4,7 +4,7 @@ # Makefile for access/transam # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/transam/Makefile,v 1.22 2008/02/19 10:30:07 petere Exp $ +# src/backend/access/transam/Makefile # #------------------------------------------------------------------------- diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 05c41d487c..eaac1393b8 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.13 2009/12/19 01:32:33 sriggs Exp $ +src/backend/access/transam/README The Transaction System ====================== @@ -543,6 +543,85 @@ consistency. Such insertions occur after WAL is operational, so they can and should write WAL records for the additional generated actions. +Write-Ahead Logging for Filesystem Actions +------------------------------------------ + +The previous section described how to WAL-log actions that only change page +contents within shared buffers. For that type of action it is generally +possible to check all likely error cases (such as insufficient space on the +page) before beginning to make the actual change. Therefore we can make +the change and the creation of the associated WAL log record "atomic" by +wrapping them into a critical section --- the odds of failure partway +through are low enough that PANIC is acceptable if it does happen. + +Clearly, that approach doesn't work for cases where there's a significant +probability of failure within the action to be logged, such as creation +of a new file or database. 
We don't want to PANIC, and we especially don't +want to PANIC after having already written a WAL record that says we did +the action --- if we did, replay of the record would probably fail again +and PANIC again, making the failure unrecoverable. This means that the +ordinary WAL rule of "write WAL before the changes it describes" doesn't +work, and we need a different design for such cases. + +There are several basic types of filesystem actions that have this +issue. Here is how we deal with each: + +1. Adding a disk page to an existing table. + +This action isn't WAL-logged at all. We extend a table by writing a page +of zeroes at its end. We must actually do this write so that we are sure +the filesystem has allocated the space. If the write fails we can just +error out normally. Once the space is known allocated, we can initialize +and fill the page via one or more normal WAL-logged actions. Because it's +possible that we crash between extending the file and writing out the WAL +entries, we have to treat discovery of an all-zeroes page in a table or +index as being a non-error condition. In such cases we can just reclaim +the space for re-use. + +2. Creating a new table, which requires a new file in the filesystem. + +We try to create the file, and if successful we make a WAL record saying +we did it. If not successful, we can just throw an error. Notice that +there is a window where we have created the file but not yet written any +WAL about it to disk. If we crash during this window, the file remains +on disk as an "orphan". It would be possible to clean up such orphans +by having database restart search for files that don't have any committed +entry in pg_class, but that currently isn't done because of the possibility +of deleting data that is useful for forensic analysis of the crash. +Orphan files are harmless --- at worst they waste a bit of disk space --- +because we check for on-disk collisions when allocating new relfilenode +OIDs. So cleaning up isn't really necessary. + +3. Deleting a table, which requires an unlink() that could fail. + +Our approach here is to WAL-log the operation first, but to treat failure +of the actual unlink() call as a warning rather than error condition. +Again, this can leave an orphan file behind, but that's cheap compared to +the alternatives. Since we can't actually do the unlink() until after +we've committed the DROP TABLE transaction, throwing an error would be out +of the question anyway. (It may be worth noting that the WAL entry about +the file deletion is actually part of the commit record for the dropping +transaction.) + +4. Creating and deleting databases and tablespaces, which requires creating +and deleting directories and entire directory trees. + +These cases are handled similarly to creating individual files, ie, we +try to do the action first and then write a WAL entry if it succeeded. +The potential amount of wasted disk space is rather larger, of course. +In the creation case we try to delete the directory tree again if creation +fails, so as to reduce the risk of wasted space. Failure partway through +a deletion operation results in a corrupt database: the DROP failed, but +some of the data is gone anyway. There is little we can do about that, +though, and in any case it was presumably data the user no longer wants. + +In all of these cases, if WAL replay fails to redo the original action +we must panic and abort recovery. 
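For pattern 2, the create-then-log ordering might look roughly like this; smgrcreate() appears elsewhere in this tree, while the logging helper, the persistence test, and the exact signatures are assumptions for illustration:

    SMgrRelation srel = smgropen(rnode, InvalidBackendId);

    smgrcreate(srel, MAIN_FORKNUM, false);          /* filesystem action first */
    if (relpersistence == RELPERSISTENCE_PERMANENT)
        log_smgrcreate(&rnode, MAIN_FORKNUM);       /* WAL record only on success */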
The DBA will have to manually clean up +(for instance, free up some disk space or fix directory permissions) and +then restart recovery. This is part of the reason for not writing a WAL +entry until we've successfully done the original action. + + Asynchronous Commit ------------------- diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 1d29d80c75..2a1b4ae111 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -23,11 +23,11 @@ * for aborts (whether sync or async), since the post-crash assumption would * be that such transactions failed anyway. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation * - * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.55 2010/01/02 16:57:35 momjian Exp $ + * src/backend/access/transam/clog.c * *------------------------------------------------------------------------- */ @@ -442,8 +442,8 @@ CLOGShmemInit(void) /* * This func must be called ONCE on system install. It creates * the initial CLOG segment. (The CLOG directory is assumed to - * have been created by the initdb shell script, and CLOGShmemInit - * must have been called already.) + * have been created by initdb, and CLOGShmemInit must have been + * called already.) */ void BootStrapCLOG(void) @@ -456,7 +456,7 @@ BootStrapCLOG(void) slotno = ZeroCLOGPage(0, false); /* Make sure it's written out */ - SimpleLruWritePage(ClogCtl, slotno, NULL); + SimpleLruWritePage(ClogCtl, slotno); Assert(!ClogCtl->shared->page_dirty[slotno]); LWLockRelease(CLogControlLock); @@ -745,7 +745,7 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record) LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(ClogCtl, slotno, NULL); + SimpleLruWritePage(ClogCtl, slotno); Assert(!ClogCtl->shared->page_dirty[slotno]); LWLockRelease(CLogControlLock); diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3f3bdc0335..bb3afd6000 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -39,10 +39,10 @@ * anything we saw during replay. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.35 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/transam/multixact.c * *------------------------------------------------------------------------- */ @@ -1454,7 +1454,7 @@ BootStrapMultiXact(void) slotno = ZeroMultiXactOffsetPage(0, false); /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); LWLockRelease(MultiXactOffsetControlLock); @@ -1465,7 +1465,7 @@ BootStrapMultiXact(void) slotno = ZeroMultiXactMemberPage(0, false); /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); + SimpleLruWritePage(MultiXactMemberCtl, slotno); Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); LWLockRelease(MultiXactMemberControlLock); @@ -1986,7 +1986,7 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); LWLockRelease(MultiXactOffsetControlLock); @@ -2001,7 +2001,7 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); + SimpleLruWritePage(MultiXactMemberCtl, slotno); Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); LWLockRelease(MultiXactMemberControlLock); diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 855c11318f..2fee00e518 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -3,8 +3,8 @@ # ------------------------------- # # Edit this file to provide the parameters that PostgreSQL needs to -# perform an archive recovery of a database, or to act as a log-streaming -# replication standby. +# perform an archive recovery of a database, or to act as a replication +# standby. # # If "recovery.conf" is present in the PostgreSQL data directory, it is # read on postmaster startup. After successful recovery, it is renamed @@ -13,9 +13,7 @@ # # This file consists of lines of the form: # -# name = 'value' -# -# (The quotes around the value are NOT optional, but the "=" is.) +# name = value # # Comments are introduced with '#'. # @@ -68,11 +66,14 @@ # If you want to stop rollforward at a specific point, you # must set a recovery target. # -# You may set a recovery target either by transactionId, -# by timestamp or by barrier id. Recovery may either include or exclude the -# transaction(s) with the recovery target value in case of timestamp or -# transactionId (ie, stop either just after or just before the given target, -# respectively). In case of barrier, the recovery stops exactly at that point +# You may set a recovery target either by transactionId, by name, +# by timestamp, or by barrier. Recovery may either include or exclude the +# transaction(s) with the recovery target value (ie, stop either +# just after or just before the given target, respectively). 
In case of +# barrier, the recovery stops exactly at that point. +# +# +#recovery_target_name = '' # e.g. 'daily backup 2011-01-26' # #recovery_target_time = '' # e.g. '2004-07-14 22:39:00 EST' # @@ -80,7 +81,7 @@ # #recovery_target_barrier = '' # -#recovery_target_inclusive = 'true' +#recovery_target_inclusive = true # # # If you want to recover into a timeline other than the "main line" shown in @@ -89,25 +90,39 @@ # #recovery_target_timeline = 'latest' # +# +# If pause_at_recovery_target is enabled, recovery will pause when +# the recovery target is reached. The pause state will continue until +# pg_xlog_replay_resume() is called. This setting has no effect if +# hot standby is not enabled, or if no recovery target is set. +# +#pause_at_recovery_target = true +# #--------------------------------------------------------------------------- # STANDBY SERVER PARAMETERS #--------------------------------------------------------------------------- # -# When standby_mode is enabled, the PostgreSQL server will work as -# a standby. It tries to connect to the primary according to the -# connection settings primary_conninfo, and receives XLOG records -# continuously. +# standby_mode +# +# When standby_mode is enabled, the PostgreSQL server will work as a +# standby. It will continuously wait for the additional XLOG records, using +# restore_command and/or primary_conninfo. +# +#standby_mode = off +# +# primary_conninfo # -#standby_mode = 'off' +# If set, the PostgreSQL server will try to connect to the primary using this +# connection string and receive XLOG records continuously. # #primary_conninfo = '' # e.g. 'host=localhost port=5432' # # -# By default, a standby server keeps streaming XLOG records from the -# primary indefinitely. If you want to stop streaming and finish recovery, -# opening up the system in read/write mode, specify path to a trigger file. -# Server will poll the trigger file path periodically and stop streaming -# when it's found. +# By default, a standby server keeps restoring XLOG records from the +# primary indefinitely. If you want to stop the standby mode, finish recovery +# and open the system in read/write mode, specify path to a trigger file. +# The server will poll the trigger file path periodically and start as a +# primary server when it's found. # #trigger_file = '' # diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index d989a59197..734d258002 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.29 2010/02/07 20:48:09 tgl Exp $ + * src/backend/access/transam/rmgr.c */ #include "postgres.h" @@ -42,7 +42,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint}, {"Hash", hash_redo, hash_desc, NULL, NULL, NULL}, {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint}, - {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint}, + {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL}, {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL} #ifdef PGXC , diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index d122a15ff6..f4cc6282eb 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -38,10 +38,10 @@ * by re-setting the page's page_dirty flag. 
* * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.50 2010/04/28 16:54:15 tgl Exp $ + * src/backend/access/transam/slru.c * *------------------------------------------------------------------------- */ @@ -78,6 +78,8 @@ typedef struct SlruFlushData int segno[MAX_FLUSH_BUFFERS]; /* their log seg#s */ } SlruFlushData; +typedef struct SlruFlushData *SlruFlush; + /* * Macro to mark a buffer slot "most recently used". Note multiple evaluation * of arguments! @@ -123,6 +125,7 @@ static int slru_errno; static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata); static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata); @@ -485,8 +488,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) * * Control lock must be held at entry, and will be held at exit. */ -void -SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) +static void +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) { SlruShared shared = ctl->shared; int pageno = shared->page_number[slotno]; @@ -567,6 +570,17 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) } /* + * Wrapper of SlruInternalWritePage, for external callers. + * fdata is always passed a NULL here. + */ +void +SimpleLruWritePage(SlruCtl ctl, int slotno) +{ + SlruInternalWritePage(ctl, slotno, NULL); +} + + +/* * Physical read of a (previously existing) page into a buffer slot * * On failure, we cannot just ereport(ERROR) since caller has put state in @@ -989,7 +1003,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) * we wait for the existing I/O to complete. */ if (shared->page_status[bestslot] == SLRU_PAGE_VALID) - SimpleLruWritePage(ctl, bestslot, NULL); + SlruInternalWritePage(ctl, bestslot, NULL); else SimpleLruWaitIO(ctl, bestslot); @@ -1023,7 +1037,7 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) for (slotno = 0; slotno < shared->num_slots; slotno++) { - SimpleLruWritePage(ctl, slotno, &fdata); + SlruInternalWritePage(ctl, slotno, &fdata); /* * When called during a checkpoint, we cannot assert that the slot is @@ -1128,7 +1142,7 @@ restart:; * keep the logic the same as it was.) */ if (shared->page_status[slotno] == SLRU_PAGE_VALID) - SimpleLruWritePage(ctl, slotno, NULL); + SlruInternalWritePage(ctl, slotno, NULL); else SimpleLruWaitIO(ctl, slotno); goto restart; diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 7e01a90710..a6d9d7f0b5 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -19,11 +19,11 @@ * data across crashes. During database startup, we simply force the * currently-active page of SUBTRANS to zeroes. 
* - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation * - * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.27 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/transam/subtrans.c * *------------------------------------------------------------------------- */ @@ -210,7 +210,7 @@ BootStrapSUBTRANS(void) slotno = ZeroSUBTRANSPage(0); /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno, NULL); + SimpleLruWritePage(SubTransCtl, slotno); Assert(!SubTransCtl->shared->page_dirty[slotno]); LWLockRelease(SubtransControlLock); diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 0c2e2dd824..bc02f15e86 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -3,12 +3,12 @@ * transam.c * postgres transaction log interface routines * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.80 2010/01/02 16:57:35 momjian Exp $ + * src/backend/access/transam/transam.c * * NOTES * This file contains the high level access-method interface to the diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 8ab6d31e7f..ee327ebb59 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3,12 +3,12 @@ * twophase.c * Two-phase commit support functions. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.62 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/transam/twophase.c * * NOTES * Each global transaction is associated with a global transaction @@ -56,7 +56,10 @@ #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" +#include "replication/walsender.h" +#include "replication/syncrep.h" #include "storage/fd.h" +#include "storage/predicate.h" #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" @@ -122,7 +125,7 @@ typedef struct GlobalTransactionData TransactionId locking_xid; /* top-level XID of backend working on xact */ bool valid; /* TRUE if fully prepared */ char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ -} GlobalTransactionData; +} GlobalTransactionData; /* * Two Phase Commit shared state. 
Access to this struct is protected @@ -870,8 +873,8 @@ StartPrepare(GlobalTransaction gxact) hdr.prepared_at = gxact->prepared_at; hdr.owner = gxact->owner; hdr.nsubxacts = xactGetCommittedChildren(&children); - hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels, NULL); - hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels, NULL); + hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels); + hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels); hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs, &hdr.initfileinval); StrNCpy(hdr.gid, gxact->gid, GIDSIZE); @@ -1032,6 +1035,13 @@ EndPrepare(GlobalTransaction gxact) /* If we crash now, we have prepared: WAL replay will fix things */ + /* + * Wake up all walsenders to send WAL up to the PREPARE record immediately + * if replication is enabled + */ + if (max_wal_senders > 0) + WalSndWakeup(); + /* write correct CRC and close file */ if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32)) { @@ -1069,6 +1079,14 @@ EndPrepare(GlobalTransaction gxact) END_CRIT_SECTION(); + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked the prepare, but still show as + * running in the procarray (twice!) and continue to hold locks. + */ + SyncRepWaitForLSN(gxact->prepare_lsn); + records.tail = records.head = NULL; } @@ -1327,13 +1345,13 @@ FinishPreparedTransaction(const char *gid, bool isCommit) } for (i = 0; i < ndelrels; i++) { - SMgrRelation srel = smgropen(delrels[i]); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); ForkNumber fork; for (fork = 0; fork <= MAX_FORKNUM; fork++) { if (smgrexists(srel, fork)) - smgrdounlink(srel, fork, false, false); + smgrdounlink(srel, fork, false); } smgrclose(srel); } @@ -1356,6 +1374,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) else ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + PredicateLockTwoPhaseFinish(xid, isCommit); + /* Count the prepared xact as committed or aborted */ AtEOXact_PgStat(isCommit); @@ -2012,6 +2032,13 @@ RecordTransactionCommitPrepared(TransactionId xid, /* Flush XLOG to disk */ XLogFlush(recptr); + /* + * Wake up all walsenders to send WAL up to the COMMIT PREPARED record + * immediately if replication is enabled + */ + if (max_wal_senders > 0) + WalSndWakeup(); + /* Mark the transaction committed in pg_clog */ TransactionIdCommitTree(xid, nchildren, children); @@ -2019,6 +2046,14 @@ RecordTransactionCommitPrepared(TransactionId xid, MyProc->inCommit = false; END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr); } /* @@ -2085,10 +2120,25 @@ RecordTransactionAbortPrepared(TransactionId xid, XLogFlush(recptr); /* + * Wake up all walsenders to send WAL up to the ABORT PREPARED record + * immediately if replication is enabled + */ + if (max_wal_senders > 0) + WalSndWakeup(); + + /* * Mark the transaction aborted in clog. This is not absolutely necessary * but we may as well do it while we are here. */ TransactionIdAbortTree(xid, nchildren, children); END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. 
+ */ + SyncRepWaitForLSN(recptr); } diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c index d8f7fb6a03..47c15af241 100644 --- a/src/backend/access/transam/twophase_rmgr.c +++ b/src/backend/access/transam/twophase_rmgr.c @@ -3,12 +3,12 @@ * twophase_rmgr.c * Two-phase-commit resource managers tables * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.14 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/transam/twophase_rmgr.c * *------------------------------------------------------------------------- */ @@ -18,12 +18,14 @@ #include "access/twophase_rmgr.h" #include "pgstat.h" #include "storage/lock.h" +#include "storage/predicate.h" const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_recover, /* Lock */ + predicatelock_twophase_recover, /* PredicateLock */ NULL, /* pgstat */ multixact_twophase_recover /* MultiXact */ }; @@ -32,6 +34,7 @@ const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_postcommit, /* Lock */ + NULL, /* PredicateLock */ pgstat_twophase_postcommit, /* pgstat */ multixact_twophase_postcommit /* MultiXact */ }; @@ -40,6 +43,7 @@ const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_postabort, /* Lock */ + NULL, /* PredicateLock */ pgstat_twophase_postabort, /* pgstat */ multixact_twophase_postabort /* MultiXact */ }; @@ -48,6 +52,7 @@ const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1 { NULL, /* END ID */ lock_twophase_standby_recover, /* Lock */ + NULL, /* PredicateLock */ NULL, /* pgstat */ NULL /* MultiXact */ }; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 4a829480f2..a5ff7537b3 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -3,11 +3,11 @@ * varsup.c * postgres OID & XID variables support routines * - * Copyright (c) 2000-2010, PostgreSQL Global Development Group + * Copyright (c) 2000-2011, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.91 2010/02/26 02:00:34 momjian Exp $ + * src/backend/access/transam/varsup.c * *------------------------------------------------------------------------- */ @@ -17,6 +17,7 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" +#include "access/xact.h" #include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" @@ -512,13 +513,22 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) /* Give an immediate warning if past the wrap warn point */ if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery) { - char *oldest_datname = get_database_name(oldest_datoid); + char *oldest_datname; /* - * Note: it's possible that get_database_name fails and returns NULL, - * for example because the database just got dropped. We'll still - * warn, even though the warning might now be unnecessary. + * We can be called when not inside a transaction, for example during + * StartupXLOG(). 
In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + if (oldest_datname) ereport(WARNING, (errmsg("database \"%s\" must be vacuumed within %u transactions", diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 982f500c50..941f6aa312 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5,13 +5,13 @@ * * See src/backend/access/transam/README for more information. * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.293 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/transam/xact.c * *------------------------------------------------------------------------- */ @@ -46,9 +46,12 @@ #include "libpq/be-fsstubs.h" #include "miscadmin.h" #include "pgstat.h" +#include "replication/walsender.h" +#include "replication/syncrep.h" #include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" @@ -71,7 +74,10 @@ int XactIsoLevel; bool DefaultXactReadOnly = false; bool XactReadOnly; -bool XactSyncCommit = true; +bool DefaultXactDeferrable = false; +bool XactDeferrable; + +int synchronous_commit = SYNCHRONOUS_COMMIT_ON; int CommitDelay = 0; /* precommit delay in microseconds */ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ @@ -498,10 +504,32 @@ AssignTransactionId(TransactionState s) /* * Ensure parent(s) have XIDs, so that a child always has an XID later - * than its parent. + * than its parent. Mustn't recurse here, or we might get a stack overflow + * if we're at the bottom of a huge stack of subtransactions none of which + * have XIDs yet. */ if (isSubXact && !TransactionIdIsValid(s->parent->transactionId)) - AssignTransactionId(s->parent); + { + TransactionState p = s->parent; + TransactionState *parents; + size_t parentOffset = 0; + + parents = palloc(sizeof(TransactionState) * s->nestingLevel); + while (p != NULL && !TransactionIdIsValid(p->transactionId)) + { + parents[parentOffset++] = p; + p = p->parent; + } + + /* + * This is technically a recursive call, but the recursion will never + * be more than one layer deep. + */ + while (parentOffset != 0) + AssignTransactionId(parents[--parentOffset]); + + pfree(parents); + } /* * Generate a new Xid and record it in PG_PROC and pg_subtrans. @@ -538,6 +566,13 @@ AssignTransactionId(TransactionState s) SubTransSetParent(s->transactionId, s->parent->transactionId, false); /* + * If it's a top-level transaction, the predicate locking system needs to + * be told about it too. + */ + if (!isSubXact) + RegisterPredicateLockingXid(s->transactionId); + + /* * Acquire lock on the transaction XID. (We assume this cannot block.) We * have to ensure that the lock is assigned to the transaction's own * ResourceOwner. 
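The hunk above replaces naive recursion with a collect-then-replay loop: gather every ancestor that still needs an XID, then assign XIDs oldest-first so each child's XID follows its parent's. A minimal standalone sketch of the same idiom, using a hypothetical Node type and next_id() counter rather than any PostgreSQL API:

#include <stdlib.h>

/* Hypothetical chain node; id == 0 means "not assigned yet". */
typedef struct Node
{
	struct Node *parent;
	int			id;
	int			depth;		/* number of ancestors; bounds the scratch array */
} Node;

static int
next_id(void)
{
	static int	counter = 0;

	return ++counter;		/* ids increase monotonically */
}

/*
 * Assign ids without recursing: walk up collecting unassigned ancestors
 * (nearest first), then replay from the oldest ancestor down, so every
 * parent receives a smaller id than its children.
 */
static void
assign_id_iteratively(Node *node)
{
	Node	  **pending = malloc(sizeof(Node *) * (size_t) node->depth);
	size_t		n = 0;
	Node	   *p;

	/* malloc failure handling omitted; this is only a sketch */
	for (p = node->parent; p != NULL && p->id == 0; p = p->parent)
		pending[n++] = p;

	while (n != 0)
		pending[--n]->id = next_id();

	node->id = next_id();
	free(pending);
}

Here depth plays the role of s->nestingLevel in the hunk above: it bounds the scratch array so the upward walk needs no reallocation, and the replay order preserves the parent-before-child assignment the comment calls for.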
@@ -874,17 +909,6 @@ CommandCounterIncrement(void) */ AtCCI_LocalCache(); } - - /* - * Make any other backends' catalog changes visible to me. - * - * XXX this is probably in the wrong place: CommandCounterIncrement should - * be purely a local operation, most likely. However fooling with this - * will affect asynchronous cross-backend interactions, which doesn't seem - * like a wise thing to do in late beta, so save improving this for - * another day - tgl 2007-11-30 - */ - AtStart_Cache(); } /* @@ -1058,18 +1082,20 @@ RecordTransactionCommit(void) TransactionId latestXid = InvalidTransactionId; int nrels; RelFileNode *rels; - bool haveNonTemp; int nchildren; TransactionId *children; - int nmsgs; + int nmsgs = 0; SharedInvalidationMessage *invalMessages = NULL; - bool RelcacheInitFileInval; + bool RelcacheInitFileInval = false; + bool wrote_xlog; /* Get data needed for commit record */ - nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp); + nrels = smgrGetPendingDeletes(true, &rels); nchildren = xactGetCommittedChildren(&children); - nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, - &RelcacheInitFileInval); + if (XLogStandbyInfoActive()) + nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, + &RelcacheInitFileInval); + wrote_xlog = (XactLastRecEnd.xrecoff != 0); /* * If we haven't been assigned an XID yet, we neither can, nor do we want @@ -1096,7 +1122,7 @@ RecordTransactionCommit(void) * assigned is a sequence advance record due to nextval() --- we want * to flush that to disk before reporting commit.) */ - if (XactLastRecEnd.xrecoff == 0) + if (!wrote_xlog) goto cleanup; } else @@ -1189,16 +1215,30 @@ RecordTransactionCommit(void) } /* - * Check if we want to commit asynchronously. If the user has set - * synchronous_commit = off, and we're not doing cleanup of any non-temp - * rels nor committing any command that wanted to force sync commit, then - * we can defer flushing XLOG. (We must not allow asynchronous commit if - * there are any non-temp tables to be deleted, because we might delete - * the files before the COMMIT record is flushed to disk. We do allow - * asynchronous commit if all to-be-deleted tables are temporary though, - * since they are lost anyway if we crash.) - */ - if (XactSyncCommit || forceSyncCommit || haveNonTemp) + * Check if we want to commit asynchronously. We can allow the XLOG flush + * to happen asynchronously if synchronous_commit=off, or if the current + * transaction has not performed any WAL-logged operation. The latter + * case can arise if the current transaction wrote only to temporary + * and/or unlogged tables. In case of a crash, the loss of such a + * transaction will be irrelevant since temp tables will be lost anyway, + * and unlogged tables will be truncated. (Given the foregoing, you might + * think that it would be unnecessary to emit the XLOG record at all in + * this case, but we don't currently try to do that. It would certainly + * cause problems at least in Hot Standby mode, where the + * KnownAssignedXids machinery requires tracking every XID assignment. It + * might be OK to skip it only when wal_level < hot_standby, but for now + * we don't.) + * + * However, if we're doing cleanup of any non-temp rels or committing any + * command that wanted to force sync commit, then we must flush XLOG + * immediately. (We must not allow asynchronous commit if there are any + * non-temp tables to be deleted, because we might delete the files before + * the COMMIT record is flushed to disk. 
We do allow asynchronous commit + * if all to-be-deleted tables are temporary though, since they are lost + * anyway if we crash.) + */ + if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || + forceSyncCommit || nrels > 0) { /* * Synchronous commit case: @@ -1213,12 +1253,19 @@ RecordTransactionCommit(void) * fewer than CommitSiblings other backends with active transactions. */ if (CommitDelay > 0 && enableFsync && - CountActiveBackends() >= CommitSiblings) + MinimumActiveBackends(CommitSiblings)) pg_usleep(CommitDelay); XLogFlush(XactLastRecEnd); /* + * Wake up all walsenders to send WAL up to the COMMIT record + * immediately if replication is enabled + */ + if (max_wal_senders > 0) + WalSndWakeup(); + + /* * Now we may update the CLOG, if we wrote a COMMIT record above */ if (markXidCommitted) @@ -1237,7 +1284,7 @@ RecordTransactionCommit(void) * Report the latest async commit LSN, so that the WAL writer knows to * flush this commit. */ - XLogSetAsyncCommitLSN(XactLastRecEnd); + XLogSetAsyncXactLSN(XactLastRecEnd); /* * We must not immediately update the CLOG, since we didn't flush the @@ -1261,6 +1308,14 @@ RecordTransactionCommit(void) /* Compute latestXid while we have the child XIDs handy */ latestXid = TransactionIdLatest(xid, nchildren, children); + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(XactLastRecEnd); + /* Reset XactLastRecEnd until the next transaction writes something */ XactLastRecEnd.xrecoff = 0; @@ -1484,7 +1539,7 @@ RecordTransactionAbort(bool isSubXact) xid); /* Fetch the data we need for the abort record */ - nrels = smgrGetPendingDeletes(false, &rels, NULL); + nrels = smgrGetPendingDeletes(false, &rels); nchildren = xactGetCommittedChildren(&children); /* XXX do we really need a critical section here? */ @@ -1540,7 +1595,7 @@ RecordTransactionAbort(bool isSubXact) * problems occur at that point. */ if (!isSubXact) - XLogSetAsyncCommitLSN(XactLastRecEnd); + XLogSetAsyncXactLSN(XactLastRecEnd); /* * Mark the transaction aborted in clog. This is not absolutely necessary @@ -1794,6 +1849,7 @@ StartTransaction(void) s->startedInRecovery = false; XactReadOnly = DefaultXactReadOnly; } + XactDeferrable = DefaultXactDeferrable; XactIsoLevel = DefaultXactIsoLevel; forceSyncCommit = false; MyXactAccessedTempRel = false; @@ -1957,12 +2013,10 @@ CommitTransaction(bool contact_gtm) #endif /* - * Do pre-commit processing (most of this stuff requires database access, - * and in fact could still cause an error...) - * - * It is possible for CommitHoldablePortals to invoke functions that queue - * deferred triggers, and it's also possible that triggers create holdable - * cursors. So we have to loop until there's nothing left to do. + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. Since closing cursors could queue trigger actions, + * triggers could open cursors, etc, we have to keep looping until there's + * nothing left to do. */ #ifdef PGXC if (!PrepareLocalCoord) @@ -1976,21 +2030,25 @@ CommitTransaction(bool contact_gtm) AfterTriggerFireDeferred(); /* - * Convert any open holdable cursors into static portals. If there - * weren't any, we are done ... otherwise loop back to check if they - * queued deferred triggers. Lather, rinse, repeat. + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... 
otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. */ - if (!CommitHoldablePortals()) + if (!PreCommit_Portals(false)) break; else IsHoldableCursor = true; } - /* Now we can shut down the deferred-trigger manager */ - AfterTriggerEndXact(true); + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ - /* Close any open regular cursors */ - AtCommit_Portals(); + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); /* * Let ON COMMIT management do its thing (must happen after closing @@ -2002,6 +2060,13 @@ CommitTransaction(bool contact_gtm) AtEOXact_LargeObject(true); /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. + */ + PreCommit_CheckForSerializationFailure(); + + /* * Insert notifications sent by NOTIFY commands into the queue. This * should be late in the pre-commit sequence to minimize time spent * holding the notify-insertion lock. @@ -2268,12 +2333,10 @@ PrepareTransaction(void) Assert(s->parent == NULL); /* - * Do pre-commit processing (most of this stuff requires database access, - * and in fact could still cause an error...) - * - * It is possible for PrepareHoldablePortals to invoke functions that - * queue deferred triggers, and it's also possible that triggers create - * holdable cursors. So we have to loop until there's nothing left to do. + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. Since closing cursors could queue trigger actions, + * triggers could open cursors, etc, we have to keep looping until there's + * nothing left to do. */ for (;;) { @@ -2283,19 +2346,23 @@ PrepareTransaction(void) AfterTriggerFireDeferred(); /* - * Convert any open holdable cursors into static portals. If there - * weren't any, we are done ... otherwise loop back to check if they - * queued deferred triggers. Lather, rinse, repeat. + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. */ - if (!PrepareHoldablePortals()) + if (!PreCommit_Portals(true)) break; } - /* Now we can shut down the deferred-trigger manager */ - AfterTriggerEndXact(true); + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ - /* Close any open regular cursors */ - AtCommit_Portals(); + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); /* * Let ON COMMIT management do its thing (must happen after closing @@ -2306,6 +2373,13 @@ PrepareTransaction(void) /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. 
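+ * (An error raised here switches us into the transaction-abort path, so + * the transaction is never marked prepared.)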
+ */ + PreCommit_CheckForSerializationFailure(); + /* NOTIFY will be handled below */ /* @@ -2370,6 +2444,7 @@ PrepareTransaction(void) AtPrepare_Notify(); AtPrepare_Locks(); + AtPrepare_PredicateLocks(); AtPrepare_PgStat(); AtPrepare_MultiXact(); AtPrepare_RelationMap(); @@ -2429,6 +2504,7 @@ PrepareTransaction(void) PostPrepare_MultiXact(xid); PostPrepare_Locks(xid); + PostPrepare_PredicateLocks(xid); #ifdef PGXC /* @@ -5020,7 +5096,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn) /* Make sure files supposed to be dropped are dropped */ for (i = 0; i < xlrec->nrels; i++) { - SMgrRelation srel = smgropen(xlrec->xnodes[i]); + SMgrRelation srel = smgropen(xlrec->xnodes[i], InvalidBackendId); ForkNumber fork; for (fork = 0; fork <= MAX_FORKNUM; fork++) @@ -5028,7 +5104,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn) if (smgrexists(srel, fork)) { XLogDropRelation(xlrec->xnodes[i], fork); - smgrdounlink(srel, fork, false, true); + smgrdounlink(srel, fork, true); } } smgrclose(srel); @@ -5069,9 +5145,9 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids); - /* Make sure nextXid is beyond any XID mentioned in the record */ - /* + * Make sure nextXid is beyond any XID mentioned in the record. + * * We don't expect anyone else to modify nextXid, hence we don't need to * hold a lock while checking this. We still acquire the lock to modify * it, though. @@ -5125,7 +5201,7 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) /* Make sure files supposed to be dropped are dropped */ for (i = 0; i < xlrec->nrels; i++) { - SMgrRelation srel = smgropen(xlrec->xnodes[i]); + SMgrRelation srel = smgropen(xlrec->xnodes[i], InvalidBackendId); ForkNumber fork; for (fork = 0; fork <= MAX_FORKNUM; fork++) @@ -5133,7 +5209,7 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) if (smgrexists(srel, fork)) { XLogDropRelation(xlrec->xnodes[i], fork); - smgrdounlink(srel, fork, false, true); + smgrdounlink(srel, fork, true); } } smgrclose(srel); @@ -5207,7 +5283,7 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) appendStringInfo(buf, "; rels:"); for (i = 0; i < xlrec->nrels; i++) { - char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM); + char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); appendStringInfo(buf, " %s", path); pfree(path); @@ -5262,7 +5338,7 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec) appendStringInfo(buf, "; rels:"); for (i = 0; i < xlrec->nrels; i++) { - char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM); + char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); appendStringInfo(buf, " %s", path); pfree(path); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 71ee729c1a..11512e0048 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4,10 +4,10 @@ * PostgreSQL transaction log manager * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.430 2010/07/06 19:18:55 momjian Exp $ + * src/backend/access/transam/xlog.c * *------------------------------------------------------------------------- */ @@ -49,8 +49,12 @@ #include "storage/bufmgr.h" #include "storage/fd.h" 
#include "storage/ipc.h" +#include "storage/latch.h" #include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" #include "storage/procarray.h" +#include "storage/reinit.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" @@ -61,16 +65,15 @@ /* File path names (all relative to $PGDATA) */ -#define BACKUP_LABEL_FILE "backup_label" -#define BACKUP_LABEL_OLD "backup_label.old" #define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_DONE "recovery.done" +#define PROMOTE_SIGNAL_FILE "promote" /* User-settable parameters */ int CheckPointSegments = 3; int wal_keep_segments = 0; -int XLOGbuffers = 8; +int XLOGbuffers = -1; int XLogArchiveTimeout = 0; bool XLogArchiveMode = false; char *XLogArchiveCommand = NULL; @@ -162,6 +165,12 @@ static XLogRecPtr LastRec; static bool LocalRecoveryInProgress = true; /* + * Local copy of SharedHotStandbyActive variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalHotStandbyActive = false; + +/* * Local state for XLogInsertAllowed(): * 1: unconditionally allowed to insert XLOG * 0: unconditionally not allowed to insert XLOG @@ -185,18 +194,21 @@ static char *recoveryEndCommand = NULL; static char *archiveCleanupCommand = NULL; static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; static bool recoveryTargetInclusive = true; +static bool recoveryPauseAtTarget = true; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static char *recoveryTargetBarrierId; +static char *recoveryTargetName; /* options taken from recovery.conf for XLOG streaming */ static bool StandbyMode = false; static char *PrimaryConnInfo = NULL; static char *TriggerFile = NULL; -/* if recoveryStopsHere returns true, it saves actual stop xid/time here */ +/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */ static TransactionId recoveryStopXid; static TimestampTz recoveryStopTime; +static char recoveryStopName[MAXFNAMELEN]; static bool recoveryStopAfter; /* @@ -209,6 +221,8 @@ static bool recoveryStopAfter; * * recoveryTargetTLI: the desired timeline that we want to end in. * + * recoveryTargetIsLatest: was the requested target timeline 'latest'? + * * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of * its known parents, newest first (so recoveryTargetTLI is always the * first list member). Only these TLIs are expected to be seen in the WAL @@ -222,6 +236,7 @@ static bool recoveryStopAfter; * to decrease. */ static TimeLineID recoveryTargetTLI; +static bool recoveryTargetIsLatest = false; static List *expectedTLIs; static TimeLineID curFileTLI; @@ -341,6 +356,17 @@ typedef struct XLogCtlInsert char *currpos; /* current insertion point in cache */ XLogRecPtr RedoRecPtr; /* current redo point for insertions */ bool forcePageWrites; /* forcing full-page writes for PITR? */ + + /* + * exclusiveBackup is true if a backup started with pg_start_backup() is + * in progress, and nonExclusiveBackups is a counter indicating the number + * of streaming base backups currently in progress. forcePageWrites is set + * to true when either of these is non-zero. lastBackupStart is the latest + * checkpoint redo location used as a starting point for an online backup. 
+ */ + bool exclusiveBackup; + int nonExclusiveBackups; + XLogRecPtr lastBackupStart; } XLogCtlInsert; /* @@ -366,7 +392,7 @@ typedef struct XLogCtlData XLogwrtResult LogwrtResult; uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ TransactionId ckptXid; - XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */ + XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ uint32 lastRemovedLog; /* latest removed/recycled XLOG segment */ uint32 lastRemovedSeg; @@ -397,6 +423,19 @@ typedef struct XLogCtlData bool SharedRecoveryInProgress; /* + * SharedHotStandbyActive indicates if we're still in crash or archive + * recovery. Protected by info_lck. + */ + bool SharedHotStandbyActive; + + /* + * recoveryWakeupLatch is used to wake up the startup process to continue + * WAL replay, if it is waiting for WAL to arrive or failover trigger file + * to appear. + */ + Latch recoveryWakeupLatch; + + /* * During recovery, we keep a copy of the latest checkpoint record here. * Used by the background writer when it wants to create a restartpoint. * @@ -411,6 +450,8 @@ typedef struct XLogCtlData XLogRecPtr recoveryLastRecPtr; /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ TimestampTz recoveryLastXTime; + /* Are we requested to pause recovery? */ + bool recoveryPause; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -536,11 +577,19 @@ typedef struct xl_parameter_change int wal_level; } xl_parameter_change; +/* logs restore point */ +typedef struct xl_restore_point +{ + TimestampTz rp_time; + char rp_name[MAXFNAMELEN]; +} xl_restore_point; + /* * Flags set by interrupt handlers for later service in the redo loop. */ static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t shutdown_requested = false; +static volatile sig_atomic_t promote_triggered = false; /* * Flag set when executing a restore command, to tell SIGTERM signal handler @@ -558,6 +607,9 @@ static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); +static void recoveryPausesHere(void); +static bool RecoveryIsPaused(void); +static void SetRecoveryPause(bool recoveryPause); static void SetLatestXTime(TimestampTz xtime); static TimestampTz GetLatestXTime(void); static void CheckRequiredParameterValues(void); @@ -597,6 +649,7 @@ static bool ValidXLOGHeader(XLogPageHeader hdr, int emode); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); static List *readTimeLineHistory(TimeLineID targetTLI); static bool existsTimeLineHistory(TimeLineID probeTLI); +static bool rescanLatestTimeLine(void); static TimeLineID findNewestTimeLine(TimeLineID startTLI); static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TimeLineID endTLI, @@ -1878,18 +1931,18 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch) } /* - * Record the LSN for an asynchronous transaction commit. - * (This should not be called for aborts, nor for synchronous commits.) + * Record the LSN for an asynchronous transaction commit/abort. + * (This should not be called for synchronous commits.) 
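+ * (Synchronous commits instead flush WAL themselves with XLogFlush; see + * RecordTransactionCommit.)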
*/ void -XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN) +XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); - if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN)) - xlogctl->asyncCommitLSN = asyncCommitLSN; + if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN)) + xlogctl->asyncXactLSN = asyncXactLSN; SpinLockRelease(&xlogctl->info_lck); } @@ -2138,7 +2191,7 @@ XLogBackgroundFlush(void) volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); - WriteRqstPtr = xlogctl->asyncCommitLSN; + WriteRqstPtr = xlogctl->asyncXactLSN; SpinLockRelease(&xlogctl->info_lck); flexible = false; /* ensure it all gets written */ } @@ -3083,10 +3136,10 @@ not_available: * * 'command' is the shell command to be executed, 'commandName' is a * human-readable name describing the command emitted in the logs. If - * 'failonSignal' is true and the command is killed by a signal, a FATAL + * 'failOnSignal' is true and the command is killed by a signal, a FATAL * error is thrown. Otherwise a WARNING is emitted. * - * This is currently used for restore_end_command and archive_cleanup_command. + * This is currently used for recovery_end_command and archive_cleanup_command. */ static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal) @@ -3215,7 +3268,7 @@ PreallocXlogFiles(XLogRecPtr endptr) /* * Get the log/seg of the latest removed or recycled WAL segment. - * Returns 0 if no WAL segments have been removed since startup. + * Returns 0/0 if no WAL segments have been removed since startup. */ void XLogGetLastRemoved(uint32 *log, uint32 *seg) @@ -3276,9 +3329,6 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr) #endif struct stat statbuf; - elog(DEBUG2, "removing WAL segments older than log file %u, segment %u", - log, seg); - /* * Initialize info about where to try to recycle to. We allow recycling * segments up to XLOGfileslop segments beyond the current XLOG location. @@ -3295,6 +3345,9 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr) XLogFileName(lastoff, ThisTimeLineID, log, seg); + elog(DEBUG2, "attempting to remove WAL segments older than log file %s", + lastoff); + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) { /* @@ -3447,7 +3500,7 @@ ValidateXLOGDirectoryStructure(void) { ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", path))); - if (mkdir(path, 0700) < 0) + if (mkdir(path, S_IRWXU) < 0) ereport(FATAL, (errmsg("could not create missing directory \"%s\": %m", path))); @@ -3679,23 +3732,32 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt) RecPtr = &tmpRecPtr; /* - * Align recptr to next page if no more records can fit on the current - * page. + * RecPtr is pointing to end+1 of the previous WAL record. We must + * advance it if necessary to where the next record starts. First, + * align to next page if no more records can fit on the current page. */ if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord) - { - NextLogPage(tmpRecPtr); - /* We will account for page header size below */ - } + NextLogPage(*RecPtr); - if (tmpRecPtr.xrecoff >= XLogFileSize) + /* Check for crossing of xlog segment boundary */ + if (RecPtr->xrecoff >= XLogFileSize) { - (tmpRecPtr.xlogid)++; - tmpRecPtr.xrecoff = 0; + (RecPtr->xlogid)++; + RecPtr->xrecoff = 0; } + + /* + * If at page start, we must skip over the page header. 
But we can't + * do that until we've read in the page, since the header size is + * variable. + */ } else { + /* + * In this case, the passed-in record pointer should already be + * pointing to a valid record starting position. + */ if (!XRecOffIsValid(RecPtr->xrecoff)) ereport(PANIC, (errmsg("invalid record offset at %X/%X", @@ -3724,11 +3786,13 @@ retry: if (targetRecOff == 0) { /* - * Can only get here in the continuing-from-prev-page case, because - * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need - * to skip over the new page's header. + * At page start, so skip over page header. The Assert checks that + * we're not scribbling on caller's record pointer; it's OK because we + * can only get here in the continuing-from-prev-record case, since + * XRecOffIsValid rejected the zero-page-offset case otherwise. */ - tmpRecPtr.xrecoff += pageHeaderSize; + Assert(RecPtr == &tmpRecPtr); + RecPtr->xrecoff += pageHeaderSize; targetRecOff = pageHeaderSize; } else if (targetRecOff < pageHeaderSize) @@ -4214,6 +4278,64 @@ existsTimeLineHistory(TimeLineID probeTLI) } /* + * Scan for new timelines that might have appeared in the archive since we + * started recovery. + * + * If there are any, the function changes recovery target TLI to the latest + * one and returns 'true'. + */ +static bool +rescanLatestTimeLine(void) +{ + TimeLineID newtarget; + + newtarget = findNewestTimeLine(recoveryTargetTLI); + if (newtarget != recoveryTargetTLI) + { + /* + * Determine the list of expected TLIs for the new TLI + */ + List *newExpectedTLIs; + + newExpectedTLIs = readTimeLineHistory(newtarget); + + /* + * If the current timeline is not part of the history of the new + * timeline, we cannot proceed to it. + * + * XXX This isn't foolproof: The new timeline might have forked from + * the current one, but before the current recovery location. In that + * case we will still switch to the new timeline and proceed replaying + * from it even though the history doesn't match what we already + * replayed. That's not good. We will likely notice at the next online + * checkpoint, as the TLI won't match what we expected, but it's not + * guaranteed. The admin needs to make sure that doesn't happen. + */ + if (!list_member_int(newExpectedTLIs, + (int) recoveryTargetTLI)) + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + ThisTimeLineID))); + else + { + /* Switch target */ + recoveryTargetTLI = newtarget; + list_free(expectedTLIs); + expectedTLIs = newExpectedTLIs; + + XLogCtl->RecoveryTargetTLI = recoveryTargetTLI; + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + return true; + } + } + return false; +} + +/* * Find the newest existing timeline, assuming that startTLI exists. * * Note: while this is somewhat heuristic, it does positively guarantee @@ -4365,7 +4487,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", recoveryStopXid); - if (recoveryTarget == RECOVERY_TARGET_TIME) + else if (recoveryTarget == RECOVERY_TARGET_TIME) snprintf(buffer, sizeof(buffer), "%s%u\t%s\t%s %s\n", (srcfd < 0) ? "" : "\n", @@ -4383,6 +4505,13 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, recoveryStopAfter ? "after" : "before", recoveryTargetBarrierId); #endif + else if (recoveryTarget == RECOVERY_TARGET_NAME) + snprintf(buffer, sizeof(buffer), + "%s%u\t%s\tat restore point \"%s\"\n", + (srcfd < 0) ? 
"" : "\n", + parentTLI, + xlogfname, + recoveryStopName); else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -4783,6 +4912,65 @@ GetSystemIdentifier(void) } /* + * Auto-tune the number of XLOG buffers. + * + * The preferred setting for wal_buffers is about 3% of shared_buffers, with + * a maximum of one XLOG segment (there is little reason to think that more + * is helpful, at least so long as we force an fsync when switching log files) + * and a minimum of 8 blocks (which was the default value prior to PostgreSQL + * 9.1, when auto-tuning was added). + * + * This should not be called until NBuffers has received its final value. + */ +static int +XLOGChooseNumBuffers(void) +{ + int xbuffers; + + xbuffers = NBuffers / 32; + if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ) + xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ; + if (xbuffers < 8) + xbuffers = 8; + return xbuffers; +} + +/* + * GUC check_hook for wal_buffers + */ +bool +check_wal_buffers(int *newval, void **extra, GucSource source) +{ + /* + * -1 indicates a request for auto-tune. + */ + if (*newval == -1) + { + /* + * If we haven't yet changed the boot_val default of -1, just let it + * be. We'll fix it when XLOGShmemSize is called. + */ + if (XLOGbuffers == -1) + return true; + + /* Otherwise, substitute the auto-tune value */ + *newval = XLOGChooseNumBuffers(); + } + + /* + * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL + * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer + * the case, we just silently treat such values as a request for the + * minimum. (We could throw an error instead, but that doesn't seem very + * helpful.) + */ + if (*newval < 4) + *newval = 4; + + return true; +} + +/* * Initialization of shared memory for XLOG */ Size @@ -4790,6 +4978,21 @@ XLOGShmemSize(void) { Size size; + /* + * If the value of wal_buffers is -1, use the preferred auto-tune value. + * This isn't an amazingly clean place to do this, but we must wait till + * NBuffers has received its final value, and must do it before using the + * value of XLOGbuffers to do anything important. + */ + if (XLOGbuffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers()); + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE); + } + Assert(XLOGbuffers > 0); + /* XLogCtl */ size = sizeof(XLogCtlData); /* xlblocks array */ @@ -4852,8 +5055,10 @@ XLOGShmemInit(void) */ XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->SharedRecoveryInProgress = true; + XLogCtl->SharedHotStandbyActive = false; XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages); SpinLockInit(&XLogCtl->info_lck); + InitSharedLatch(&XLogCtl->recoveryWakeupLatch); /* * If we are not in bootstrap mode, pg_control should already exist. Read @@ -4905,9 +5110,15 @@ BootStrapXLOG(void) page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer); memset(page, 0, XLOG_BLCKSZ); - /* Set up information for the initial checkpoint record */ + /* + * Set up information for the initial checkpoint record + * + * The initial checkpoint record is written to the beginning of the WAL + * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not + * used, so that we can use 0/0 to mean "before any valid WAL segment". 
+ */ checkPoint.redo.xlogid = 0; - checkPoint.redo.xrecoff = SizeOfXLogLongPHD; + checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD; checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.nextXidEpoch = 0; checkPoint.nextXid = FirstNormalTransactionId; @@ -4930,7 +5141,7 @@ BootStrapXLOG(void) page->xlp_info = XLP_LONG_HEADER; page->xlp_tli = ThisTimeLineID; page->xlp_pageaddr.xlogid = 0; - page->xlp_pageaddr.xrecoff = 0; + page->xlp_pageaddr.xrecoff = XLogSegSize; longpage = (XLogLongPageHeader) page; longpage->xlp_sysid = sysidentifier; longpage->xlp_seg_size = XLogSegSize; @@ -4956,7 +5167,7 @@ BootStrapXLOG(void) /* Create first XLOG segment file */ use_existent = false; - openLogFile = XLogFileInit(0, 0, &use_existent, false); + openLogFile = XLogFileInit(0, 1, &use_existent, false); /* Write the first page with the initial record */ errno = 0; @@ -5023,115 +5234,20 @@ str_time(pg_time_t tnow) } /* - * Parse one line from recovery.conf. 'cmdline' is the raw line from the - * file. If the line is parsed successfully, returns true, false indicates - * syntax error. On success, *key_p and *value_p are set to the parameter - * name and value on the line, respectively. If the line is an empty line, - * consisting entirely of whitespace and comments, function returns true - * and *keyp_p and *value_p are set to NULL. - * - * The pointers returned in *key_p and *value_p point to an internal buffer - * that is valid only until the next call of parseRecoveryCommandFile(). - */ -static bool -parseRecoveryCommandFileLine(char *cmdline, char **key_p, char **value_p) -{ - char *ptr; - char *bufp; - char *key; - char *value; - static char *buf = NULL; - - *key_p = *value_p = NULL; - - /* - * Allocate the buffer on first use. It's used to hold both the parameter - * name and value. - */ - if (buf == NULL) - buf = malloc(MAXPGPATH + 1); - bufp = buf; - - /* Skip any whitespace at the beginning of line */ - for (ptr = cmdline; *ptr; ptr++) - { - if (!isspace((unsigned char) *ptr)) - break; - } - /* Ignore empty lines */ - if (*ptr == '\0' || *ptr == '#') - return true; - - /* Read the parameter name */ - key = bufp; - while (*ptr && !isspace((unsigned char) *ptr) && - *ptr != '=' && *ptr != '\'') - *(bufp++) = *(ptr++); - *(bufp++) = '\0'; - - /* Skip to the beginning quote of the parameter value */ - ptr = strchr(ptr, '\''); - if (!ptr) - return false; - ptr++; - - /* Read the parameter value to *bufp. Collapse any '' escapes as we go. */ - value = bufp; - for (;;) - { - if (*ptr == '\'') - { - ptr++; - if (*ptr == '\'') - *(bufp++) = '\''; - else - { - /* end of parameter */ - *bufp = '\0'; - break; - } - } - else if (*ptr == '\0') - return false; /* unterminated quoted string */ - else - *(bufp++) = *ptr; - - ptr++; - } - *(bufp++) = '\0'; - - /* Check that there's no garbage after the value */ - while (*ptr) - { - if (*ptr == '#') - break; - if (!isspace((unsigned char) *ptr)) - return false; - ptr++; - } - - /* Success! */ - *key_p = key; - *value_p = value; - return true; -} - -/* * See if there is a recovery command file (recovery.conf), and if so * read in parameters for archive recovery and XLOG streaming. * - * XXX longer term intention is to expand this to - * cater for additional parameters and controls - * possibly use a flex lexer similar to the GUC one + * The file is parsed using the main configuration parser. 
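+ *
+ * For illustration only (the path and timestamp below are made up),
+ * values therefore follow the same single-quoting rules as
+ * postgresql.conf, e.g. a recovery.conf might read:
+ *
+ *   restore_command = 'cp /mnt/server/archivedir/%f "%p"'
+ *   recovery_target_time = '2011-02-01 12:00:00'
+ *   standby_mode = 'on'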
*/ static void readRecoveryCommandFile(void) { FILE *fd; - char cmdline[MAXPGPATH]; TimeLineID rtli = 0; bool rtliGiven = false; - bool syntaxError = false; + ConfigVariable *item, + *head = NULL, + *tail = NULL; fd = AllocateFile(RECOVERY_COMMAND_FILE, "r"); if (fd == NULL) @@ -5145,55 +5261,56 @@ readRecoveryCommandFile(void) } /* - * Parse the file... + * Since we're asking ParseConfigFp() to error out at FATAL, there's no + * need to check the return value. */ - while (fgets(cmdline, sizeof(cmdline), fd) != NULL) - { - char *tok1; - char *tok2; + ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail); - if (!parseRecoveryCommandFileLine(cmdline, &tok1, &tok2)) - { - syntaxError = true; - break; - } - if (tok1 == NULL) - continue; - - if (strcmp(tok1, "restore_command") == 0) + for (item = head; item; item = item->next) + { + if (strcmp(item->name, "restore_command") == 0) { - recoveryRestoreCommand = pstrdup(tok2); + recoveryRestoreCommand = pstrdup(item->value); ereport(DEBUG2, (errmsg("restore_command = '%s'", recoveryRestoreCommand))); } - else if (strcmp(tok1, "recovery_end_command") == 0) + else if (strcmp(item->name, "recovery_end_command") == 0) { - recoveryEndCommand = pstrdup(tok2); + recoveryEndCommand = pstrdup(item->value); ereport(DEBUG2, (errmsg("recovery_end_command = '%s'", recoveryEndCommand))); } - else if (strcmp(tok1, "archive_cleanup_command") == 0) + else if (strcmp(item->name, "archive_cleanup_command") == 0) { - archiveCleanupCommand = pstrdup(tok2); + archiveCleanupCommand = pstrdup(item->value); ereport(DEBUG2, (errmsg("archive_cleanup_command = '%s'", archiveCleanupCommand))); } - else if (strcmp(tok1, "recovery_target_timeline") == 0) + else if (strcmp(item->name, "pause_at_recovery_target") == 0) + { + if (!parse_bool(item->value, &recoveryPauseAtTarget)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target"))); + ereport(DEBUG2, + (errmsg("pause_at_recovery_target = '%s'", item->value))); + } + else if (strcmp(item->name, "recovery_target_timeline") == 0) { rtliGiven = true; - if (strcmp(tok2, "latest") == 0) + if (strcmp(item->value, "latest") == 0) rtli = 0; else { errno = 0; - rtli = (TimeLineID) strtoul(tok2, NULL, 0); + rtli = (TimeLineID) strtoul(item->value, NULL, 0); if (errno == EINVAL || errno == ERANGE) ereport(FATAL, (errmsg("recovery_target_timeline is not a valid number: \"%s\"", - tok2))); + item->value))); } if (rtli) ereport(DEBUG2, @@ -5202,26 +5319,27 @@ readRecoveryCommandFile(void) ereport(DEBUG2, (errmsg("recovery_target_timeline = latest"))); } - else if (strcmp(tok1, "recovery_target_xid") == 0) + else if (strcmp(item->name, "recovery_target_xid") == 0) { errno = 0; - recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0); + recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0); if (errno == EINVAL || errno == ERANGE) ereport(FATAL, (errmsg("recovery_target_xid is not a valid number: \"%s\"", - tok2))); + item->value))); ereport(DEBUG2, (errmsg("recovery_target_xid = %u", recoveryTargetXid))); recoveryTarget = RECOVERY_TARGET_XID; } - else if (strcmp(tok1, "recovery_target_time") == 0) + else if (strcmp(item->name, "recovery_target_time") == 0) { /* - * if recovery_target_xid specified, then this overrides - * recovery_target_time + * if recovery_target_xid or recovery_target_name specified, then + * this overrides recovery_target_time */ - if (recoveryTarget == RECOVERY_TARGET_XID) + if (recoveryTarget == 
RECOVERY_TARGET_XID || + recoveryTarget == RECOVERY_TARGET_NAME) continue; recoveryTarget = RECOVERY_TARGET_TIME; @@ -5230,7 +5348,7 @@ readRecoveryCommandFile(void) */ recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, - CStringGetDatum(tok2), + CStringGetDatum(item->value), ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1))); ereport(DEBUG2, @@ -5238,43 +5356,63 @@ readRecoveryCommandFile(void) timestamptz_to_str(recoveryTargetTime)))); } #ifdef PGXC - else if (strcmp(tok1, "recovery_target_barrier") == 0) + else if (strcmp(item->name, "recovery_target_barrier") == 0) { recoveryTarget = RECOVERY_TARGET_BARRIER; - recoveryTargetBarrierId = pstrdup(tok2); + recoveryTargetBarrierId = pstrdup(item->value); } #endif - else if (strcmp(tok1, "recovery_target_inclusive") == 0) + else if (strcmp(item->name, "recovery_target_name") == 0) + { + /* + * if recovery_target_xid specified, then this overrides + * recovery_target_name + */ + if (recoveryTarget == RECOVERY_TARGET_XID) + continue; + recoveryTarget = RECOVERY_TARGET_NAME; + + recoveryTargetName = pstrdup(item->value); + if (strlen(recoveryTargetName) >= MAXFNAMELEN) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery_target_name is too long (maximum %d characters)", MAXFNAMELEN - 1))); + + ereport(DEBUG2, + (errmsg("recovery_target_name = '%s'", + recoveryTargetName))); + } + else if (strcmp(item->name, "recovery_target_inclusive") == 0) { /* * does nothing if a recovery_target is not also set */ - if (!parse_bool(tok2, &recoveryTargetInclusive)) + if (!parse_bool(item->value, &recoveryTargetInclusive)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value"))); + errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive"))); ereport(DEBUG2, - (errmsg("recovery_target_inclusive = %s", tok2))); + (errmsg("recovery_target_inclusive = %s", item->value))); } - else if (strcmp(tok1, "standby_mode") == 0) + else if (strcmp(item->name, "standby_mode") == 0) { - if (!parse_bool(tok2, &StandbyMode)) + if (!parse_bool(item->value, &StandbyMode)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"standby_mode\" requires a Boolean value"))); + errmsg("parameter \"%s\" requires a Boolean value", "standby_mode"))); ereport(DEBUG2, - (errmsg("standby_mode = '%s'", tok2))); + (errmsg("standby_mode = '%s'", item->value))); } - else if (strcmp(tok1, "primary_conninfo") == 0) + else if (strcmp(item->name, "primary_conninfo") == 0) { - PrimaryConnInfo = pstrdup(tok2); + PrimaryConnInfo = pstrdup(item->value); ereport(DEBUG2, (errmsg("primary_conninfo = '%s'", PrimaryConnInfo))); } - else if (strcmp(tok1, "trigger_file") == 0) + else if (strcmp(item->name, "trigger_file") == 0) { - TriggerFile = pstrdup(tok2); + TriggerFile = pstrdup(item->value); ereport(DEBUG2, (errmsg("trigger_file = '%s'", TriggerFile))); @@ -5282,17 +5420,9 @@ readRecoveryCommandFile(void) else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", - tok1))); + item->name))); } - FreeFile(fd); - - if (syntaxError) - ereport(FATAL, - (errmsg("syntax error in recovery command file: %s", - cmdline), - errhint("Lines should have the format parameter = 'value'."))); - /* * Check for compulsory parameters */ @@ -5331,13 +5461,18 @@ readRecoveryCommandFile(void) (errmsg("recovery target timeline %u does not exist", rtli))); recoveryTargetTLI = rtli; + recoveryTargetIsLatest = false; } else { /* We start the 
"latest" search from pg_control's timeline */ recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); + recoveryTargetIsLatest = true; } } + + FreeConfigVariables(head); + FreeFile(fd); } /* @@ -5463,8 +5598,8 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg) * Returns TRUE if we are stopping, FALSE otherwise. On TRUE return, * *includeThis is set TRUE if we should apply this record before stopping. * - * We also track the timestamp of the latest applied COMMIT/ABORT record - * in XLogCtl->recoveryLastXTime, for logging purposes. + * We also track the timestamp of the latest applied COMMIT/ABORT + * record in XLogCtl->recoveryLastXTime, for logging purposes. * Also, some information is saved in recoveryStopXid et al for use in * annotating the new timeline's history file. */ @@ -5478,13 +5613,16 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) #endif uint8 record_info; TimestampTz recordXtime; + char recordRPName[MAXFNAMELEN]; #ifdef PGXC /* We only consider stoppping at COMMIT, ABORT or BARRIER records */ - if ((record->xl_rmid != RM_XACT_ID) && (record->xl_rmid != RM_BARRIER_ID)) + if (record->xl_rmid != RM_XACT_ID && + record->xl_rmid != RM_BARRIER_ID && + record->xl_rmid != RM_XLOG_ID) #else - /* We only consider stopping at COMMIT or ABORT records */ - if (record->xl_rmid != RM_XACT_ID) + /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */ + if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID) #endif return false; @@ -5493,14 +5631,14 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (record->xl_rmid == RM_XACT_ID) { #endif - if (record_info == XLOG_XACT_COMMIT) + if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT) { xl_xact_commit *recordXactCommitData; recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); recordXtime = recordXactCommitData->xact_time; } - else if (record_info == XLOG_XACT_ABORT) + else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT) { xl_xact_abort *recordXactAbortData; @@ -5519,20 +5657,33 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) } } #endif + else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + recordXtime = recordRestorePointData->rp_time; + strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN); + } else return false; /* Do we have a PITR target at all? 
*/ if (recoveryTarget == RECOVERY_TARGET_UNSET) { - SetLatestXTime(recordXtime); + /* + * Save timestamp of latest transaction commit/abort if this is a + * transaction record + */ + if (record->xl_rmid == RM_XACT_ID) + SetLatestXTime(recordXtime); return false; } if (recoveryTarget == RECOVERY_TARGET_XID) { /* - * there can be only one transaction end record with this exact + * There can be only one transaction end record with this exact * transactionid * * when testing for an xid, we MUST test for equality only, since @@ -5559,10 +5710,24 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) } } #endif + else if (recoveryTarget == RECOVERY_TARGET_NAME) + { + /* + * There can be many restore points that share the same name, so we + * stop at the first one + */ + stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0); + + /* + * Ignore recoveryTargetInclusive because this is not a transaction + * record + */ + *includeThis = false; + } else { /* - * there can be many transactions that share the same commit time, so + * There can be many transactions that share the same commit time, so * we stop after the last one, if we are inclusive, or stop at the * first one if we are exclusive */ @@ -5593,7 +5758,7 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) recoveryStopXid, timestamptz_to_str(recoveryStopTime)))); } - else + else if (record_info == XLOG_XACT_ABORT) { if (recoveryStopAfter) ereport(LOG, @@ -5606,8 +5771,22 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) recoveryStopXid, timestamptz_to_str(recoveryStopTime)))); } + else + { + strncpy(recoveryStopName, recordRPName, MAXFNAMELEN); - if (recoveryStopAfter) + ereport(LOG, + (errmsg("recovery stopping at restore point \"%s\", time %s", + recoveryStopName, + timestamptz_to_str(recoveryStopTime)))); + } + + /* + * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a + * restore point since they are timestamped, though the latest + * transaction time is not updated. + */ + if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter) SetLatestXTime(recordXtime); } #ifdef PGXC @@ -5621,13 +5800,121 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) return true; } #endif - else + else if (record->xl_rmid == RM_XACT_ID) SetLatestXTime(recordXtime); return stopsHere; } /* + * Recheck shared recoveryPause by polling. + * + * XXX Can also be done with shared latch. 
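+ *
+ * A latch-based sketch, for illustration only ('recoveryPauseLatch' is a
+ * hypothetical field, and SetRecoveryPause() would also have to set it):
+ *
+ *	while (RecoveryIsPaused())
+ *	{
+ *		WaitLatch(&XLogCtl->recoveryPauseLatch, 1000000L);
+ *		ResetLatch(&XLogCtl->recoveryPauseLatch);
+ *		HandleStartupProcInterrupts();
+ *	}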
+ */ +static void +recoveryPausesHere(void) +{ + ereport(LOG, + (errmsg("recovery has paused"), + errhint("Execute pg_xlog_replay_resume() to continue."))); + + while (RecoveryIsPaused()) + { + pg_usleep(1000000L); /* 1000 ms */ + HandleStartupProcInterrupts(); + } +} + +static bool +RecoveryIsPaused(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + bool recoveryPause; + + SpinLockAcquire(&xlogctl->info_lck); + recoveryPause = xlogctl->recoveryPause; + SpinLockRelease(&xlogctl->info_lck); + + return recoveryPause; +} + +static void +SetRecoveryPause(bool recoveryPause) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->recoveryPause = recoveryPause; + SpinLockRelease(&xlogctl->info_lck); +} + +/* + * pg_xlog_replay_pause - pause recovery now + */ +Datum +pg_xlog_replay_pause(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to control recovery")))); + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + SetRecoveryPause(true); + + PG_RETURN_VOID(); +} + +/* + * pg_xlog_replay_resume - resume recovery now + */ +Datum +pg_xlog_replay_resume(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to control recovery")))); + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + SetRecoveryPause(false); + + PG_RETURN_VOID(); +} + +/* + * pg_is_xlog_replay_paused + */ +Datum +pg_is_xlog_replay_paused(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to control recovery")))); + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + PG_RETURN_BOOL(RecoveryIsPaused()); +} + +/* * Save timestamp of latest processed commit/abort record. * * We keep this in XLogCtl, not a simple static variable, so that it can be @@ -5663,6 +5950,24 @@ GetLatestXTime(void) } /* + * Returns timestamp of latest processed commit/abort record. + * + * When the server has been started normally without recovery the function + * returns NULL. + */ +Datum +pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS) +{ + TimestampTz xtime; + + xtime = GetLatestXTime(); + if (xtime == 0) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(xtime); +} + +/* * Returns bool with current recovery mode, a global state. */ Datum @@ -5888,11 +6193,22 @@ StartupXLOG(void) (errmsg("starting point-in-time recovery to barrier %s", (recoveryTargetBarrierId)))); #endif + else if (recoveryTarget == RECOVERY_TARGET_NAME) + ereport(LOG, + (errmsg("starting point-in-time recovery to \"%s\"", + recoveryTargetName))); else ereport(LOG, (errmsg("starting archive recovery"))); } + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. 
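+	 *
+	 * (Only the process that owns a latch may WaitLatch() on it, so the
+	 * startup process must take ownership here; other backends merely
+	 * SetLatch() it through WakeupRecovery().)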
+	 */
+	if (StandbyMode)
+		OwnLatch(&XLogCtl->recoveryWakeupLatch);
+
 	if (read_backup_label(&checkPointLoc))
 	{
 		/*
@@ -5902,16 +6218,33 @@
 		record = ReadCheckpointRecord(checkPointLoc, 0);
 		if (record != NULL)
 		{
+			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+			wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
 			ereport(DEBUG1,
 					(errmsg("checkpoint record is at %X/%X",
 							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
 			InRecovery = true;	/* force recovery even if SHUTDOWNED */
+
+			/*
+			 * Make sure that REDO location exists. This may not be the case
+			 * if there was a crash during an online backup, which left a
+			 * backup_label around that references a WAL segment that's
+			 * already been archived.
+			 */
+			if (XLByteLT(checkPoint.redo, checkPointLoc))
+			{
+				if (!ReadRecord(&(checkPoint.redo), LOG, false))
+					ereport(FATAL,
+							(errmsg("could not find redo location referenced by checkpoint record"),
+							 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
+			}
 		}
 		else
 		{
-			ereport(PANIC,
+			ereport(FATAL,
 					(errmsg("could not locate required checkpoint record"),
 					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
+			wasShutdown = false;	/* keep compiler quiet */
 		}
 		/* set flag to delete it later */
 		haveBackupLabel = true;
@@ -5955,11 +6288,11 @@
 			ereport(PANIC,
 					(errmsg("could not locate a valid checkpoint record")));
 		}
+		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+		wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
 	}
 
 	LastRec = RecPtr = checkPointLoc;
-	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
 
 	ereport(DEBUG1,
 			(errmsg("redo record is at %X/%X; shutdown %s",
@@ -6052,8 +6385,7 @@
 	}
 
 	/*
-	 * set backupStartupPoint if we're starting archive recovery from a
-	 * base backup
+	 * set backupStartPoint if we're starting recovery from a base backup
 	 */
 	if (haveBackupLabel)
 		ControlFile->backupStartPoint = checkPoint.redo;
@@ -6091,6 +6423,14 @@
 		CheckRequiredParameterValues();
 
 		/*
+		 * We're in recovery, so unlogged relations may be trashed
+		 * and must be reset. This should be done BEFORE allowing Hot Standby
+		 * connections, so that read-only backends don't try to read whatever
+		 * garbage is left over from before.
+		 */
+		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
+
+		/*
 		 * Initialize for Hot Standby, if enabled. We won't let backends in
 		 * yet, not until we've reached the min recovery point specified in
 		 * control file and we've established a recovery snapshot from a
@@ -6117,8 +6457,6 @@
 			StartupSUBTRANS(oldestActiveXID);
 			StartupMultiXact();
 
-			ProcArrayInitRecoveryInfo(oldestActiveXID);
-
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
			 * nothing was running on the master at this point.
So fake-up an @@ -6175,6 +6513,7 @@ StartupXLOG(void) xlogctl->replayEndRecPtr = ReadRecPtr; xlogctl->recoveryLastRecPtr = ReadRecPtr; xlogctl->recoveryLastXTime = 0; + xlogctl->recoveryPause = false; SpinLockRelease(&xlogctl->info_lck); /* Also ensure XLogReceiptTime has a sane value */ @@ -6193,6 +6532,7 @@ StartupXLOG(void) */ if (InArchiveRecovery && IsUnderPostmaster) { + PublishStartupProcessInformation(); SetForwardFsyncRequests(); SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); bgwriterLaunched = true; @@ -6223,6 +6563,7 @@ StartupXLOG(void) { bool recoveryContinue = true; bool recoveryApply = true; + bool recoveryPause = false; ErrorContextCallback errcontext; TimestampTz xtime; @@ -6269,6 +6610,15 @@ StartupXLOG(void) */ if (recoveryStopsHere(record, &recoveryApply)) { + /* + * Pause only if users can connect to send a resume + * message + */ + if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY) + { + SetRecoveryPause(true); + recoveryPausesHere(); + } reachedStopPoint = true; /* see below */ recoveryContinue = false; if (!recoveryApply) @@ -6295,9 +6645,16 @@ StartupXLOG(void) */ SpinLockAcquire(&xlogctl->info_lck); xlogctl->replayEndRecPtr = EndRecPtr; + recoveryPause = xlogctl->recoveryPause; SpinLockRelease(&xlogctl->info_lck); /* + * Pause only if users can connect to send a resume message + */ + if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY) + recoveryPausesHere(); + + /* * If we are attempting to enter Hot Standby mode, process * XIDs we see */ @@ -6346,12 +6703,18 @@ StartupXLOG(void) } /* - * If we launched a WAL receiver, it should be gone by now. It will trump - * over the startup checkpoint and subsequent records if it's still alive, - * so be extra sure that it's gone. + * Kill WAL receiver, if it's still running, before we continue to write + * the startup checkpoint record. It will trump over the checkpoint and + * subsequent records if it's still alive when we start writing WAL. + */ + ShutdownWalRcv(); + + /* + * We don't need the latch anymore. It's not strictly necessary to disown + * it, but let's do it for the sake of tidiness. */ - if (WalRcvInProgress()) - elog(PANIC, "wal receiver still active"); + if (StandbyMode) + DisownLatch(&XLogCtl->recoveryWakeupLatch); /* * We are now done reading the xlog from stream. Turn off streaming @@ -6375,16 +6738,36 @@ StartupXLOG(void) * be further ahead --- ControlFile->minRecoveryPoint cannot have been * advanced beyond the WAL we processed. */ - if (InArchiveRecovery && + if (InRecovery && (XLByteLT(EndOfLog, minRecoveryPoint) || !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) { - if (reachedStopPoint) /* stopped because of stop request */ + if (reachedStopPoint) + { + /* stopped because of stop request */ ereport(FATAL, (errmsg("requested recovery stop point is before consistent recovery point"))); - else /* ran off end of WAL */ - ereport(FATAL, - (errmsg("WAL ends before consistent recovery point"))); + } + + /* + * Ran off end of WAL before reaching end-of-backup WAL record, or + * minRecoveryPoint. That's usually a bad sign, indicating that you + * tried to recover from an online backup but never called + * pg_stop_backup(), or you didn't archive all the WAL up to that + * point. However, this also happens in crash recovery, if the system + * crashes while an online backup is in progress. We must not treat + * that as an error, or the database will refuse to start up. 
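+	 *
+	 * Concretely (an illustrative scenario): a crash while an exclusive
+	 * pg_start_backup() backup is running leaves backupStartPoint set in
+	 * pg_control, but the ensuing crash recovery has InArchiveRecovery
+	 * false, so it falls through here without erroring out.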
+ */ + if (InArchiveRecovery) + { + if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + ereport(FATAL, + (errmsg("WAL ends before end of online backup"), + errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery."))); + else + ereport(FATAL, + (errmsg("WAL ends before consistent recovery point"))); + } } /* @@ -6539,6 +6922,14 @@ StartupXLOG(void) PreallocXlogFiles(EndOfLog); /* + * Reset initial contents of unlogged relations. This has to be done + * AFTER recovery is complete so that any unlogged relations created + * during recovery also get picked up. + */ + if (InRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + + /* * Okay, we're officially UP. */ InRecovery = false; @@ -6630,8 +7021,6 @@ StartupXLOG(void) static void CheckRecoveryConsistency(void) { - static bool backendsAllowed = false; - /* * Have we passed our safe starting point? */ @@ -6651,11 +7040,19 @@ CheckRecoveryConsistency(void) * enabling connections. */ if (standbyState == STANDBY_SNAPSHOT_READY && - !backendsAllowed && + !LocalHotStandbyActive && reachedMinRecoveryPoint && IsUnderPostmaster) { - backendsAllowed = true; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->SharedHotStandbyActive = true; + SpinLockRelease(&xlogctl->info_lck); + + LocalHotStandbyActive = true; + SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); } } @@ -6703,6 +7100,38 @@ RecoveryInProgress(void) } /* + * Is HotStandby active yet? This is only important in special backends + * since normal backends won't ever be able to connect until this returns + * true. Postmaster knows this by way of signal, not via shared memory. + * + * Unlike testing standbyState, this works in any process that's connected to + * shared memory. + */ +bool +HotStandbyActive(void) +{ + /* + * We check shared state each time only until Hot Standby is active. We + * can't de-activate Hot Standby, so there's no need to keep checking + * after the shared variable has once been seen true. + */ + if (LocalHotStandbyActive) + return true; + else + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + /* spinlock is essential on machines with weak memory ordering! */ + SpinLockAcquire(&xlogctl->info_lck); + LocalHotStandbyActive = xlogctl->SharedHotStandbyActive; + SpinLockRelease(&xlogctl->info_lck); + + return LocalHotStandbyActive; + } +} + +/* * Is this process allowed to insert new WAL records? * * Ordinarily this is essentially equivalent to !RecoveryInProgress(). @@ -7078,10 +7507,15 @@ LogCheckpointEnd(bool restartpoint) { long write_secs, sync_secs, - total_secs; + total_secs, + longest_secs, + average_secs; int write_usecs, sync_usecs, - total_usecs; + total_usecs, + longest_usecs, + average_usecs; + uint64 average_sync_time; CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); @@ -7097,18 +7531,43 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); + /* + * Timing values returned from CheckpointStats are in microseconds. + * Convert to the second plus microsecond form that TimestampDifference + * returns for homogeneous printing. 
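+	 *
+	 * Worked example (illustrative numbers): ckpt_longest_sync = 1234567
+	 * usec yields longest_secs = 1 and longest_usecs = 234567, which the
+	 * "%ld.%03d s" format below prints as "1.234 s".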
+ */ + longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000); + longest_usecs = CheckpointStats.ckpt_longest_sync - + (uint64) longest_secs *1000000; + + average_sync_time = 0; + if (CheckpointStats.ckpt_sync_rels > 0) + average_sync_time = CheckpointStats.ckpt_agg_sync_time / + CheckpointStats.ckpt_sync_rels; + average_secs = (long) (average_sync_time / 1000000); + average_usecs = average_sync_time - (uint64) average_secs *1000000; + if (restartpoint) elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " - "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + "%d transaction log file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, write_secs, write_usecs / 1000, sync_secs, sync_usecs / 1000, - total_secs, total_usecs / 1000); + total_secs, total_usecs / 1000, + CheckpointStats.ckpt_sync_rels, + longest_secs, longest_usecs / 1000, + average_secs, average_usecs / 1000); else elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " "%d transaction log file(s) added, %d removed, %d recycled; " - "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, @@ -7116,7 +7575,10 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_segs_recycled, write_secs, write_usecs / 1000, sync_secs, sync_usecs / 1000, - total_secs, total_usecs / 1000); + total_secs, total_usecs / 1000, + CheckpointStats.ckpt_sync_rels, + longest_secs, longest_usecs / 1000, + average_secs, average_usecs / 1000); } /* @@ -7570,6 +8032,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointCLOG(); CheckPointSUBTRANS(); CheckPointMultiXact(); + CheckPointPredicate(); CheckPointRelationMap(); CheckPointBuffers(flags); /* performs all required fsyncs */ /* We deliberately delay 2PC checkpointing as long as possible */ @@ -7721,16 +8184,18 @@ CreateRestartPoint(int flags) SpinLockRelease(&xlogctl->info_lck); LWLockRelease(WALInsertLock); - if (log_checkpoints) - { - /* - * Prepare to accumulate statistics. - */ - MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); - CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. + */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + if (log_checkpoints) LogCheckpointStart(flags, true); - } CheckPointGuts(lastCheckPoint.redo, flags); @@ -7785,10 +8250,14 @@ CreateRestartPoint(int flags) } /* - * Currently, there is no need to truncate pg_subtrans during recovery. If - * we did do that, we will need to have called StartupSUBTRANS() already - * and then TruncateSUBTRANS() would go here. + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). 
When hot standby is disabled, though, we mustn't do + * this because StartupSUBTRANS hasn't been called yet. */ + if (EnableHotStandby) + TruncateSUBTRANS(GetOldestXmin(true, false)); /* All real work is done, but log before releasing lock. */ if (log_checkpoints) @@ -7876,6 +8345,33 @@ RequestXLogSwitch(void) } /* + * Write a RESTORE POINT record + */ +XLogRecPtr +XLogRestorePoint(const char *rpName) +{ + XLogRecPtr RecPtr; + XLogRecData rdata; + xl_restore_point xlrec; + + xlrec.rp_time = GetCurrentTimestamp(); + strncpy(xlrec.rp_name, rpName, MAXFNAMELEN); + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) &xlrec; + rdata.len = sizeof(xl_restore_point); + rdata.next = NULL; + + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata); + + ereport(LOG, + (errmsg("restore point \"%s\" created at %X/%X", + rpName, RecPtr.xlogid, RecPtr.xrecoff))); + + return RecPtr; +} + +/* * Check if any of the GUC parameters that are critical for hot standby * have changed, and update the value in pg_control file if necessary. */ @@ -8067,6 +8563,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) { /* nothing to do here */ } + else if (info == XLOG_RESTORE_POINT) + { + /* nothing to do here */ + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; @@ -8169,6 +8669,13 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) { appendStringInfo(buf, "xlog switch"); } + else if (info == XLOG_RESTORE_POINT) + { + xl_restore_point *xlrec = (xl_restore_point *) rec; + + appendStringInfo(buf, "restore point: %s", xlrec->rp_name); + + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; @@ -8246,7 +8753,7 @@ get_sync_bit(int method) /* * Optimize writes by bypassing kernel cache with O_DIRECT when using - * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are + * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are * disabled, otherwise the archive command or walsender process will read * the WAL soon after writing it, which is guaranteed to cause a physical * read if we bypassed the kernel cache. We also skip the @@ -8291,12 +8798,9 @@ get_sync_bit(int method) /* * GUC support */ -bool -assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source) +void +assign_xlog_sync_method(int new_sync_method, void *extra) { - if (!doit) - return true; - if (sync_method != new_sync_method) { /* @@ -8316,8 +8820,6 @@ assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source) XLogFileClose(); } } - - return true; } @@ -8383,6 +8885,43 @@ pg_start_backup(PG_FUNCTION_ARGS) text *backupid = PG_GETARG_TEXT_P(0); bool fast = PG_GETARG_BOOL(1); char *backupidstr; + XLogRecPtr startpoint; + char startxlogstr[MAXFNAMELEN]; + + backupidstr = text_to_cstring(backupid); + + startpoint = do_pg_start_backup(backupidstr, fast, NULL); + + snprintf(startxlogstr, sizeof(startxlogstr), "%X/%X", + startpoint.xlogid, startpoint.xrecoff); + PG_RETURN_TEXT_P(cstring_to_text(startxlogstr)); +} + +/* + * do_pg_start_backup is the workhorse of the user-visible pg_start_backup() + * function. It creates the necessary starting checkpoint and constructs the + * backup label file. + * + * There are two kind of backups: exclusive and non-exclusive. An exclusive + * backup is started with pg_start_backup(), and there can be only one active + * at a time. The backup label file of an exclusive backup is written to + * $PGDATA/backup_label, and it is removed by pg_stop_backup(). 
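+ *
+ * A typical exclusive-mode session looks like this (illustration only):
+ *
+ *   SELECT pg_start_backup('nightly', false);
+ *   -- copy the data directory with tar, rsync, or similar
+ *   SELECT pg_stop_backup();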
+ * + * A non-exclusive backup is used for the streaming base backups (see + * src/backend/replication/basebackup.c). The difference to exclusive backups + * is that the backup label file is not written to disk. Instead, its would-be + * contents are returned in *labelfile, and the caller is responsible for + * including it in the backup archive as 'backup_label'. There can be many + * non-exclusive backups active at the same time, and they don't conflict + * with an exclusive backup either. + * + * Every successfully started non-exclusive backup must be stopped by calling + * do_pg_stop_backup() or do_pg_abort_backup(). + */ +XLogRecPtr +do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile) +{ + bool exclusive = (labelfile == NULL); XLogRecPtr checkpointloc; XLogRecPtr startpoint; pg_time_t stamp_time; @@ -8392,11 +8931,12 @@ pg_start_backup(PG_FUNCTION_ARGS) uint32 _logSeg; struct stat stat_buf; FILE *fp; + StringInfoData labelfbuf; - if (!superuser()) + if (!superuser() && !is_authenticated_user_replication_role()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to run a backup"))); + errmsg("must be superuser or replication role to run a backup"))); if (RecoveryInProgress()) ereport(ERROR, @@ -8410,7 +8950,24 @@ pg_start_backup(PG_FUNCTION_ARGS) errmsg("WAL level not sufficient for making an online backup"), errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start."))); - backupidstr = text_to_cstring(backupid); + if (strlen(backupidstr) > MAXPGPATH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("backup label too long (max %d bytes)", + MAXPGPATH))); + + /* + * Force an XLOG file switch before the checkpoint, to ensure that the WAL + * segment the checkpoint is written to doesn't contain pages with old + * timeline IDs. That would otherwise happen if you called + * pg_start_backup() right after restoring from a PITR archive: the first + * WAL segment containing the startup checkpoint has pages in the + * beginning with the old timeline ID. That can cause trouble at recovery: + * we won't have a history file covering the old timeline if pg_xlog + * directory was not included in the base backup and the WAL archive was + * cleared too before starting the backup. + */ + RequestXLogSwitch(); /* * Mark backup active in shared memory. We must do full-page WAL writes @@ -8430,122 +8987,168 @@ pg_start_backup(PG_FUNCTION_ARGS) * ensure adequate interlocking against XLogInsert(). */ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); - if (XLogCtl->Insert.forcePageWrites) + if (exclusive) { - LWLockRelease(WALInsertLock); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("a backup is already in progress"), - errhint("Run pg_stop_backup() and try again."))); + if (XLogCtl->Insert.exclusiveBackup) + { + LWLockRelease(WALInsertLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress"), + errhint("Run pg_stop_backup() and try again."))); + } + XLogCtl->Insert.exclusiveBackup = true; } + else + XLogCtl->Insert.nonExclusiveBackups++; XLogCtl->Insert.forcePageWrites = true; LWLockRelease(WALInsertLock); - /* - * Force an XLOG file switch before the checkpoint, to ensure that the WAL - * segment the checkpoint is written to doesn't contain pages with old - * timeline IDs. 
That would otherwise happen if you called
-	 * pg_start_backup() right after restoring from a PITR archive: the first
-	 * WAL segment containing the startup checkpoint has pages in the
-	 * beginning with the old timeline ID. That can cause trouble at recovery:
-	 * we won't have a history file covering the old timeline if pg_xlog
-	 * directory was not included in the base backup and the WAL archive was
-	 * cleared too before starting the backup.
-	 */
-	RequestXLogSwitch();
-
 	/* Ensure we release forcePageWrites if fail below */
-	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
+	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 	{
-		/*
-		 * Force a CHECKPOINT. Aside from being necessary to prevent torn
-		 * page problems, this guarantees that two successive backup runs will
-		 * have different checkpoint positions and hence different history
-		 * file names, even if nothing happened in between.
-		 *
-		 * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
-		 * fast = true). Otherwise this can take awhile.
-		 */
-		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
-						  (fast ? CHECKPOINT_IMMEDIATE : 0));
+		bool		gotUniqueStartpoint = false;
 
-		/*
-		 * Now we need to fetch the checkpoint record location, and also its
-		 * REDO pointer. The oldest point in WAL that would be needed to
-		 * restore starting from the checkpoint is precisely the REDO pointer.
-		 */
-		LWLockAcquire(ControlFileLock, LW_SHARED);
-		checkpointloc = ControlFile->checkPoint;
-		startpoint = ControlFile->checkPointCopy.redo;
-		LWLockRelease(ControlFileLock);
+		do
+		{
+			/*
+			 * Force a CHECKPOINT. Aside from being necessary to prevent torn
+			 * page problems, this guarantees that two successive backup runs
+			 * will have different checkpoint positions and hence different
+			 * history file names, even if nothing happened in between.
+			 *
+			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+			 * passing fast = true). Otherwise this can take awhile.
+			 */
+			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+							  (fast ? CHECKPOINT_IMMEDIATE : 0));
+
+			/*
+			 * Now we need to fetch the checkpoint record location, and also
+			 * its REDO pointer. The oldest point in WAL that would be needed
+			 * to restore starting from the checkpoint is precisely the REDO
+			 * pointer.
+			 */
+			LWLockAcquire(ControlFileLock, LW_SHARED);
+			checkpointloc = ControlFile->checkPoint;
+			startpoint = ControlFile->checkPointCopy.redo;
+			LWLockRelease(ControlFileLock);
+
+			/*
+			 * If two base backups are started at the same time (in WAL sender
+			 * processes), we need to make sure that they use different
+			 * checkpoints as starting locations, because we use the starting
+			 * WAL location as a unique identifier for the base backup in the
+			 * end-of-backup WAL record and when we write the backup history
+			 * file. Perhaps it would be better to generate a separate unique
+			 * ID for each backup instead of forcing another checkpoint, but
+			 * taking a checkpoint right after another is not that expensive
+			 * either because only a few buffers have been dirtied yet.
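+			 *
+			 * (Why the retry loop terminates, for illustration: each forced
+			 * checkpoint advances the redo pointer past the previous
+			 * checkpoint record, so a retrying backend soon sees a
+			 * startpoint newer than lastBackupStart and the XLByteLT()
+			 * test below succeeds.)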
+ */ + LWLockAcquire(WALInsertLock, LW_SHARED); + if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint)) + { + XLogCtl->Insert.lastBackupStart = startpoint; + gotUniqueStartpoint = true; + } + LWLockRelease(WALInsertLock); + } while (!gotUniqueStartpoint); XLByteToSeg(startpoint, _logId, _logSeg); XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg); + /* + * Construct backup label file + */ + initStringInfo(&labelfbuf); + /* Use the log timezone here, not the session timezone */ stamp_time = (pg_time_t) time(NULL); pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone)); + appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n", + startpoint.xlogid, startpoint.xrecoff, xlogfilename); + appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n", + checkpointloc.xlogid, checkpointloc.xrecoff); + appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf); + appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr); /* - * Check for existing backup label --- implies a backup is already - * running. (XXX given that we checked forcePageWrites above, maybe - * it would be OK to just unlink any such label file?) + * Okay, write the file, or return its contents to caller. */ - if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0) + if (exclusive) { - if (errno != ENOENT) + /* + * Check for existing backup label --- implies a backup is already + * running. (XXX given that we checked exclusiveBackup above, + * maybe it would be OK to just unlink any such label file?) + */ + if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + BACKUP_LABEL_FILE))); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress"), + errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.", + BACKUP_LABEL_FILE))); + + fp = AllocateFile(BACKUP_LABEL_FILE, "w"); + + if (!fp) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", + errmsg("could not create file \"%s\": %m", BACKUP_LABEL_FILE))); + fwrite(labelfbuf.data, labelfbuf.len, 1, fp); + if (fflush(fp) || ferror(fp) || FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + BACKUP_LABEL_FILE))); + pfree(labelfbuf.data); } else - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("a backup is already in progress"), - errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.", - BACKUP_LABEL_FILE))); - - /* - * Okay, write the file - */ - fp = AllocateFile(BACKUP_LABEL_FILE, "w"); - if (!fp) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", - BACKUP_LABEL_FILE))); - fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n", - startpoint.xlogid, startpoint.xrecoff, xlogfilename); - fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n", - checkpointloc.xlogid, checkpointloc.xrecoff); - fprintf(fp, "START TIME: %s\n", strfbuf); - fprintf(fp, "LABEL: %s\n", backupidstr); - if (fflush(fp) || ferror(fp) || FreeFile(fp)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write file \"%s\": %m", - BACKUP_LABEL_FILE))); + *labelfile = labelfbuf.data; } - PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0); + PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); /* * We're done. 
As a convenience, return the starting WAL location.
 	 */
-	snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
-			 startpoint.xlogid, startpoint.xrecoff);
-	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
+	return startpoint;
 }
 
 /* Error cleanup callback for pg_start_backup */
 static void
 pg_start_backup_callback(int code, Datum arg)
 {
-	/* Turn off forcePageWrites on failure */
+	bool		exclusive = DatumGetBool(arg);
+
+	/* Update backup counters and forcePageWrites on failure */
 	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
-	XLogCtl->Insert.forcePageWrites = false;
+	if (exclusive)
+	{
+		Assert(XLogCtl->Insert.exclusiveBackup);
+		XLogCtl->Insert.exclusiveBackup = false;
+	}
+	else
+	{
+		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+		XLogCtl->Insert.nonExclusiveBackups--;
+	}
+
+	if (!XLogCtl->Insert.exclusiveBackup &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
+	{
+		XLogCtl->Insert.forcePageWrites = false;
+	}
 	LWLockRelease(WALInsertLock);
 }
 
@@ -8565,6 +9168,27 @@ pg_start_backup_callback(int code, Datum arg)
 Datum
 pg_stop_backup(PG_FUNCTION_ARGS)
 {
+	XLogRecPtr	stoppoint;
+	char		stopxlogstr[MAXFNAMELEN];
+
+	stoppoint = do_pg_stop_backup(NULL, true);
+
+	snprintf(stopxlogstr, sizeof(stopxlogstr), "%X/%X",
+			 stoppoint.xlogid, stoppoint.xrecoff);
+	PG_RETURN_TEXT_P(cstring_to_text(stopxlogstr));
+}
+
+/*
+ * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
+ * function.
+ *
+ * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
+ * the non-exclusive backup specified by 'labelfile'.
+ */
+XLogRecPtr
+do_pg_stop_backup(char *labelfile, bool waitforarchive)
+{
+	bool		exclusive = (labelfile == NULL);
 	XLogRecPtr	startpoint;
 	XLogRecPtr	stoppoint;
 	XLogRecData	rdata;
@@ -8580,15 +9204,15 @@ pg_stop_backup(PG_FUNCTION_ARGS)
 	FILE	   *lfp;
 	FILE	   *fp;
 	char		ch;
-	int			ich;
 	int			seconds_before_warning;
 	int			waits = 0;
 	bool		reported_waiting = false;
+	char	   *remaining;
 
-	if (!superuser())
+	if (!superuser() && !is_authenticated_user_replication_role())
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-				 (errmsg("must be superuser to run a backup"))));
+				 (errmsg("must be superuser or replication role to run a backup"))));
 
 	if (RecoveryInProgress())
 		ereport(ERROR,
@@ -8603,38 +9227,88 @@
 				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
 
 	/*
-	 * OK to clear forcePageWrites
+	 * OK to update backup counters and forcePageWrites
	 */
 	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
-	XLogCtl->Insert.forcePageWrites = false;
+	if (exclusive)
+		XLogCtl->Insert.exclusiveBackup = false;
+	else
+	{
+		/*
+		 * The user-visible pg_start/stop_backup() functions that operate on
+		 * exclusive backups can be called at any time, but for non-exclusive
+		 * backups, it is expected that each do_pg_start_backup() call is
+		 * matched by exactly one do_pg_stop_backup() call.
+		 */
+		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+		XLogCtl->Insert.nonExclusiveBackups--;
+	}
+
+	if (!XLogCtl->Insert.exclusiveBackup &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
+	{
+		XLogCtl->Insert.forcePageWrites = false;
+	}
 	LWLockRelease(WALInsertLock);
 
-	/*
-	 * Open the existing label file
-	 */
-	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
-	if (!lfp)
+	if (exclusive)
 	{
-		if (errno != ENOENT)
+		/*
		 * Read the existing label file into memory.
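+		 *
+		 * For reference, the file being read back was written by
+		 * do_pg_start_backup() and looks like this (illustrative values):
+		 *
+		 *   START WAL LOCATION: 0/2000020 (file 000000010000000000000002)
+		 *   CHECKPOINT LOCATION: 0/2000058
+		 *   START TIME: 2011-02-01 12:00:00 GMT
+		 *   LABEL: nightly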
+ */ + struct stat statbuf; + int r; + + if (stat(BACKUP_LABEL_FILE, &statbuf)) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + BACKUP_LABEL_FILE))); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is not in progress"))); + } + + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE))); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("a backup is not in progress"))); + } + labelfile = palloc(statbuf.st_size + 1); + r = fread(labelfile, statbuf.st_size, 1, lfp); + labelfile[statbuf.st_size] = '\0'; + + /* + * Close and remove the backup label file + */ + if (r != 1 || ferror(lfp) || FreeFile(lfp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + if (unlink(BACKUP_LABEL_FILE) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + BACKUP_LABEL_FILE))); } /* * Read and parse the START WAL LOCATION line (this code is pretty crude, * but we are not expecting any variability in the file format). */ - if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c", + if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename, &ch) != 4 || ch != '\n') ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ /* * Write the backup-end xlog record @@ -8677,8 +9351,7 @@ pg_stop_backup(PG_FUNCTION_ARGS) fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n", stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename); /* transfer remaining lines from label to history file */ - while ((ich = fgetc(lfp)) != EOF) - fputc(ich, fp); + fprintf(fp, "%s", remaining); fprintf(fp, "STOP TIME: %s\n", strfbuf); if (fflush(fp) || ferror(fp) || FreeFile(fp)) ereport(ERROR, @@ -8687,20 +9360,6 @@ pg_stop_backup(PG_FUNCTION_ARGS) histfilepath))); /* - * Close and remove the backup label file - */ - if (ferror(lfp) || FreeFile(lfp)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - BACKUP_LABEL_FILE))); - if (unlink(BACKUP_LABEL_FILE) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", - BACKUP_LABEL_FILE))); - - /* * Clean out any no-longer-needed history files. As a side effect, this * will post a .ready file for the newly created history file, notifying * the archiver that history file may be archived immediately. @@ -8726,7 +9385,7 @@ pg_stop_backup(PG_FUNCTION_ARGS) * wish to wait, you can set statement_timeout. Also, some notices are * issued to clue in anyone who might be doing this interactively. */ - if (XLogArchivingActive()) + if (waitforarchive && XLogArchivingActive()) { XLByteToPrevSeg(stoppoint, _logId, _logSeg); XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg); @@ -8767,16 +9426,41 @@ pg_stop_backup(PG_FUNCTION_ARGS) ereport(NOTICE, (errmsg("pg_stop_backup complete, all required WAL segments have been archived"))); } - else + else if (waitforarchive) ereport(NOTICE, (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup"))); /* * We're done. 
As a convenience, return the ending WAL location.
 	 */
-	snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
-			 stoppoint.xlogid, stoppoint.xrecoff);
-	PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
+	return stoppoint;
+}
+
+
+/*
+ * do_pg_abort_backup: abort a running backup
+ *
+ * This does just the most basic steps of do_pg_stop_backup(), by taking the
+ * system out of backup mode, thus making it a lot safer to call from
+ * an error handler.
+ *
+ * NB: This is only for aborting a non-exclusive backup that doesn't write
+ * backup_label. A backup started with pg_start_backup() needs to be finished
+ * with pg_stop_backup().
+ */
+void
+do_pg_abort_backup(void)
+{
+	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+	XLogCtl->Insert.nonExclusiveBackups--;
+
+	if (!XLogCtl->Insert.exclusiveBackup &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
+	{
+		XLogCtl->Insert.forcePageWrites = false;
+	}
+	LWLockRelease(WALInsertLock);
 }
 
 /*
@@ -8810,6 +9494,51 @@ pg_switch_xlog(PG_FUNCTION_ARGS)
 }
 
 /*
+ * pg_create_restore_point: a named point for restore
+ */
+Datum
+pg_create_restore_point(PG_FUNCTION_ARGS)
+{
+	text	   *restore_name = PG_GETARG_TEXT_P(0);
+	char	   *restore_name_str;
+	XLogRecPtr	restorepoint;
+	char		location[MAXFNAMELEN];
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to create a restore point"))));
+
+	if (RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 (errmsg("recovery is in progress"),
+				  errhint("WAL control functions cannot be executed during recovery."))));
+
+	if (!XLogIsNeeded())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("WAL level not sufficient for creating a restore point"),
+				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+
+	restore_name_str = text_to_cstring(restore_name);
+
+	if (strlen(restore_name_str) >= MAXFNAMELEN)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
+
+	restorepoint = XLogRestorePoint(restore_name_str);
+
+	/*
+	 * As a convenience, return the WAL location of the restore point record
+	 */
+	snprintf(location, sizeof(location), "%X/%X",
+			 restorepoint.xlogid, restorepoint.xrecoff);
+	PG_RETURN_TEXT_P(cstring_to_text(location));
+}
+
+/*
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
@@ -8895,6 +9624,25 @@ pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
 }
 
 /*
+ * Get latest redo apply position.
+ *
 * Exported to allow WALReceiver to read the pointer directly.
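+ *
+ * For illustration, a caller can compare the apply position against a
+ * receive position it tracks (names here are examples only):
+ *
+ *	XLogRecPtr	applied = GetXLogReplayRecPtr();
+ *
+ *	if (XLByteLT(applied, receivedUpto))
+ *		;	/* replay is still catching up with received WAL */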
+ */ +XLogRecPtr +GetXLogReplayRecPtr(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + XLogRecPtr recptr; + + SpinLockAcquire(&xlogctl->info_lck); + recptr = xlogctl->recoveryLastRecPtr; + SpinLockRelease(&xlogctl->info_lck); + + return recptr; +} + +/* * Report the last WAL replay location (same format as pg_start_backup etc) * * This is useful for determining how much of WAL is visible to read-only @@ -8903,14 +9651,10 @@ pg_last_xlog_receive_location(PG_FUNCTION_ARGS) Datum pg_last_xlog_replay_location(PG_FUNCTION_ARGS) { - /* use volatile pointer to prevent code rearrangement */ - volatile XLogCtlData *xlogctl = XLogCtl; XLogRecPtr recptr; char location[MAXFNAMELEN]; - SpinLockAcquire(&xlogctl->info_lck); - recptr = xlogctl->recoveryLastRecPtr; - SpinLockRelease(&xlogctl->info_lck); + recptr = GetXLogReplayRecPtr(); if (recptr.xlogid == 0 && recptr.xrecoff == 0) PG_RETURN_NULL(); @@ -9214,11 +9958,27 @@ startupproc_quickdie(SIGNAL_ARGS) } +/* SIGUSR1: let latch facility handle the signal */ +static void +StartupProcSigUsr1Handler(SIGNAL_ARGS) +{ + latch_sigusr1_handler(); +} + +/* SIGUSR2: set flag to finish recovery */ +static void +StartupProcTriggerHandler(SIGNAL_ARGS) +{ + promote_triggered = true; + WakeupRecovery(); +} + /* SIGHUP: set flag to re-read config file at next convenient time */ static void StartupProcSigHupHandler(SIGNAL_ARGS) { got_SIGHUP = true; + WakeupRecovery(); } /* SIGTERM: set flag to abort redo and exit */ @@ -9229,6 +9989,7 @@ StartupProcShutdownHandler(SIGNAL_ARGS) proc_exit(1); else shutdown_requested = true; + WakeupRecovery(); } /* Handle SIGHUP and SIGTERM signals of startup process */ @@ -9288,8 +10049,8 @@ StartupProcessMain(void) else pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); - pqsignal(SIGUSR1, SIG_IGN); - pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGUSR1, StartupProcSigUsr1Handler); + pqsignal(SIGUSR2, StartupProcTriggerHandler); /* * Reset some signals that are accepted by postmaster but not here @@ -9472,16 +10233,17 @@ retry: } /* - * Data not here yet, so check for trigger then sleep. + * Data not here yet, so check for trigger then sleep for + * five seconds like in the WAL file polling case below. */ if (CheckForStandbyTrigger()) - goto triggered; + goto retry; /* - * When streaming is active, we want to react quickly when - * the next WAL record arrives, so sleep only a bit. + * Wait for more WAL to arrive, or timeout to be reached */ - pg_usleep(100000L); /* 100ms */ + WaitLatch(&XLogCtl->recoveryWakeupLatch, 5000000L); + ResetLatch(&XLogCtl->recoveryWakeupLatch); } else { @@ -9510,13 +10272,24 @@ retry: { /* * We've exhausted all options for retrieving the - * file. Retry ... + * file. Retry. */ failedSources = 0; /* - * ... but sleep first if it hasn't been long since - * last attempt. + * Before we sleep, re-scan for possible new timelines + * if we were requested to recover to the latest + * timeline. + */ + if (recoveryTargetIsLatest) + { + if (rescanLatestTimeLine()) + continue; + } + + /* + * If it hasn't been long since last attempt, sleep to + * avoid busy-waiting. */ now = (pg_time_t) time(NULL); if ((now - last_fail_time) < 5) @@ -9734,14 +10507,28 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) } /* - * Check to see if the trigger file exists. If it does, request postmaster - * to shut down walreceiver, wait for it to exit, remove the trigger - * file, and return true. 
+ * Check to see whether the user-specified trigger file exists and whether a
+ * promote request has arrived. If either condition holds, request postmaster
+ * to shut down walreceiver, wait for it to exit, and return true.
 */
 static bool
 CheckForStandbyTrigger(void)
 {
 	struct stat stat_buf;
+	static bool triggered = false;
+
+	if (triggered)
+		return true;
+
+	if (promote_triggered)
+	{
+		ereport(LOG,
+				(errmsg("received promote request")));
+		ShutdownWalRcv();
+		promote_triggered = false;
+		triggered = true;
+		return true;
+	}
 
 	if (TriggerFile == NULL)
 		return false;
@@ -9752,7 +10539,39 @@ CheckForStandbyTrigger(void)
 				(errmsg("trigger file found: %s", TriggerFile)));
 		ShutdownWalRcv();
 		unlink(TriggerFile);
+		triggered = true;
 		return true;
 	}
 	return false;
 }
+
+/*
+ * Check to see if a promote request has arrived. Should be
+ * called by postmaster after receiving SIGUSR1.
+ */
+bool
+CheckPromoteSignal(void)
+{
+	struct stat stat_buf;
+
+	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+	{
+		/*
+		 * Since we are in a signal handler, it's not safe to elog. We
+		 * silently ignore any error from unlink.
+		 */
+		unlink(PROMOTE_SIGNAL_FILE);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Wake up startup process to replay newly arrived WAL, or to notice that
+ * failover has been requested.
+ */
+void
+WakeupRecovery(void)
+{
+	SetLatch(&XLogCtl->recoveryWakeupLatch);
+}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 3d7c7cf69e..cbb61bb899 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -8,10 +8,10 @@
 * None of this code is used during normal system operation.
 *
 *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.71 2010/07/08 16:08:30 tgl Exp $
+ * src/backend/access/transam/xlogutils.c
 *
 *-------------------------------------------------------------------------
 */
@@ -68,7 +68,7 @@ log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
 	 */
 	if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
 	{
-		char	   *path = relpath(node, forkno);
+		char	   *path = relpathperm(node, forkno);
 
 		if (present)
 			elog(DEBUG1, "page %u of relation %s is uninitialized",
@@ -133,7 +133,7 @@ forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
 	{
 		if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
 		{
-			char	   *path = relpath(hentry->key.node, forkno);
+			char	   *path = relpathperm(hentry->key.node, forkno);
 
 			elog(DEBUG2, "page %u of relation %s has been dropped",
 				 hentry->key.blkno, path);
@@ -166,7 +166,7 @@ forget_invalid_pages_db(Oid dbid)
 	{
 		if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
 		{
-			char	   *path = relpath(hentry->key.node, hentry->key.forkno);
+			char	   *path = relpathperm(hentry->key.node, hentry->key.forkno);
 
 			elog(DEBUG2, "page %u of relation %s has been dropped",
 				 hentry->key.blkno, path);
@@ -200,7 +200,7 @@ XLogCheckInvalidPages(void)
 	 */
 	while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 	{
-		char	   *path = relpath(hentry->key.node, hentry->key.forkno);
+		char	   *path = relpathperm(hentry->key.node, hentry->key.forkno);
 
 		if (hentry->present)
 			elog(WARNING, "page %u of relation %s was uninitialized",
@@ -276,7 +276,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	Assert(blkno != P_NEW);
 
 	/* Open the relation at smgr level */
-	smgr = smgropen(rnode);
+	smgr = smgropen(rnode, InvalidBackendId);
 
 	/*
 	 * Create the target file if it doesn't already exist. This lets us cope
@@ -293,7 +293,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	if (blkno < lastblock)
 	{
 		/* page exists in file */
-		buffer = ReadBufferWithoutRelcache(rnode, false, forknum, blkno,
+		buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
 										   mode, NULL);
 	}
 	else
@@ -312,7 +312,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 		{
 			if (buffer != InvalidBuffer)
 				ReleaseBuffer(buffer);
-			buffer = ReadBufferWithoutRelcache(rnode, false, forknum,
+			buffer = ReadBufferWithoutRelcache(rnode, forknum,
 											   P_NEW, mode, NULL);
 			lastblock++;
 		}
@@ -377,6 +377,8 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
 	rel->rd_rel = &fakeentry->pgc;
 	rel->rd_node = rnode;
+	/* We will never be working with temp rels during recovery */
+	rel->rd_backend = InvalidBackendId;
 
 	/* We don't know the name of the relation; use relfilenode instead */
 	sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
diff --git a/src/backend/bootstrap/.cvsignore b/src/backend/bootstrap/.cvsignore
deleted file mode 100644
index 5201be9886..0000000000
--- a/src/backend/bootstrap/.cvsignore
+++ /dev/null
@@ -1,2 +0,0 @@
-bootparse.c
-bootscanner.c
diff --git a/src/backend/bootstrap/.gitignore b/src/backend/bootstrap/.gitignore
new file mode 100644
index 0000000000..1ffe8ca39e
--- /dev/null
+++ b/src/backend/bootstrap/.gitignore
@@ -0,0 +1,2 @@
+/bootparse.c
+/bootscanner.c
diff --git a/src/backend/bootstrap/Makefile b/src/backend/bootstrap/Makefile
index f1dd7e12c3..a77d864800 100644
--- a/src/backend/bootstrap/Makefile
+++ b/src/backend/bootstrap/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for the bootstrap module
 #
-# $PostgreSQL: pgsql/src/backend/bootstrap/Makefile,v 1.39 2010/01/05 03:56:52 tgl Exp $
+# src/backend/bootstrap/Makefile
 #
 #-------------------------------------------------------------------------
 
@@ -12,7 +12,7 @@
 include $(top_builddir)/src/Makefile.global
 override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
 
-OBJS= bootparse.o bootstrap.o
+OBJS= bootparse.o bootstrap.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y
index 9cc68501ff..a9d2428698 100644
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -4,12 +4,12 @@
 * bootparse.y
 *	  yacc grammar for the "bootstrap" mode (BKI file format)
 *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.105 2010/02/07 20:48:09 tgl Exp $
+ *	  src/backend/bootstrap/bootparse.y
 *
 *-------------------------------------------------------------------------
 */
@@ -219,6 +219,7 @@ Boot_CreateStmt:
 											$3,
 											tupdesc,
 											RELKIND_RELATION,
+											RELPERSISTENCE_PERMANENT,
 											shared_relation,
 											mapped_relation,
 											true);
@@ -238,6 +239,7 @@ Boot_CreateStmt:
 											tupdesc,
 											NIL,
 											RELKIND_RELATION,
+											RELPERSISTENCE_PERMANENT,
 											shared_relation,
 											mapped_relation,
 											true,
@@ -342,6 +344,7 @@ boot_index_param:
 					n->name = $1;
 					n->expr = NULL;
 					n->indexcolname = NULL;
+					n->collation = NIL;
 					n->opclass = list_make1(makeString($2));
 					n->ordering = SORTBY_DEFAULT;
 					n->nulls_ordering = SORTBY_NULLS_DEFAULT;
diff --git a/src/backend/bootstrap/bootscanner.l b/src/backend/bootstrap/bootscanner.l
index f2c8f824bd..fba020008f 100644
--- a/src/backend/bootstrap/bootscanner.l
+++ b/src/backend/bootstrap/bootscanner.l
@@ -4,12 +4,12 @@
 * bootscanner.l
 *	  a lexical scanner for the bootstrap parser
 *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootscanner.l,v 1.51 2010/01/02 16:57:36 momjian Exp $
+ *	  src/backend/bootstrap/bootscanner.l
 *
 *-------------------------------------------------------------------------
 */
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 058542fecc..27e0e482c9 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -4,12 +4,12 @@
 * routines to support running postgres in 'bootstrap' mode
 * bootstrap mode is used to create the initial template database
 *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.261 2010/04/20 01:38:52 tgl Exp $
+ *	  src/backend/bootstrap/bootstrap.c
 *
 *-------------------------------------------------------------------------
 */
@@ -27,6 +27,7 @@
 #include "access/xact.h"
 #include "bootstrap/bootstrap.h"
 #include "catalog/index.h"
+#include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
@@ -77,7 +78,8 @@ int			numattr;		/* number of attributes for cur. rel */
 
 /*
 * Basic information associated with each type. This is used before
- * pg_type is created.
+ * pg_type is filled, so it has to cover the datatypes used as column types
+ * in the core "bootstrapped" catalogs.
 *
 * XXX several of these input/output functions do catalog scans
 * (e.g., F_REGPROCIN scans pg_proc). this obviously creates some
 *
@@ -92,54 +94,57 @@ struct typinfo
 	bool		byval;
 	char		align;
 	char		storage;
+	Oid			collation;
 	Oid			inproc;
 	Oid			outproc;
 };
 
 static const struct typinfo TypInfo[] = {
-	{"bool", BOOLOID, 0, 1, true, 'c', 'p',
+	{"bool", BOOLOID, 0, 1, true, 'c', 'p', InvalidOid,
 	 F_BOOLIN, F_BOOLOUT},
-	{"bytea", BYTEAOID, 0, -1, false, 'i', 'x',
+	{"bytea", BYTEAOID, 0, -1, false, 'i', 'x', InvalidOid,
 	 F_BYTEAIN, F_BYTEAOUT},
-	{"char", CHAROID, 0, 1, true, 'c', 'p',
+	{"char", CHAROID, 0, 1, true, 'c', 'p', InvalidOid,
 	 F_CHARIN, F_CHAROUT},
-	{"int2", INT2OID, 0, 2, true, 's', 'p',
+	{"int2", INT2OID, 0, 2, true, 's', 'p', InvalidOid,
 	 F_INT2IN, F_INT2OUT},
-	{"int4", INT4OID, 0, 4, true, 'i', 'p',
+	{"int4", INT4OID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_INT4IN, F_INT4OUT},
-	{"float4", FLOAT4OID, 0, 4, FLOAT4PASSBYVAL, 'i', 'p',
+	{"float4", FLOAT4OID, 0, 4, FLOAT4PASSBYVAL, 'i', 'p', InvalidOid,
 	 F_FLOAT4IN, F_FLOAT4OUT},
-	{"name", NAMEOID, CHAROID, NAMEDATALEN, false, 'c', 'p',
+	{"name", NAMEOID, CHAROID, NAMEDATALEN, false, 'c', 'p', InvalidOid,
 	 F_NAMEIN, F_NAMEOUT},
-	{"regclass", REGCLASSOID, 0, 4, true, 'i', 'p',
+	{"regclass", REGCLASSOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_REGCLASSIN, F_REGCLASSOUT},
-	{"regproc", REGPROCOID, 0, 4, true, 'i', 'p',
+	{"regproc", REGPROCOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_REGPROCIN, F_REGPROCOUT},
-	{"regtype", REGTYPEOID, 0, 4, true, 'i', 'p',
+	{"regtype", REGTYPEOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_REGTYPEIN, F_REGTYPEOUT},
-	{"text", TEXTOID, 0, -1, false, 'i', 'x',
+	{"text", TEXTOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
 	 F_TEXTIN, F_TEXTOUT},
-	{"oid", OIDOID, 0, 4, true, 'i', 'p',
+	{"oid", OIDOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_OIDIN, F_OIDOUT},
-	{"tid", TIDOID, 0, 6, false, 's', 'p',
+	{"tid", TIDOID, 0, 6, false, 's', 'p', InvalidOid,
 	 F_TIDIN, F_TIDOUT},
-	{"xid", XIDOID, 0, 4, true, 'i', 'p',
+	{"xid", XIDOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_XIDIN, F_XIDOUT},
-	{"cid", CIDOID, 0, 4, true, 'i', 'p',
+	{"cid", CIDOID, 0, 4, true, 'i', 'p', InvalidOid,
 	 F_CIDIN, F_CIDOUT},
-	{"int2vector", INT2VECTOROID, INT2OID, -1, false, 'i', 'p',
+	{"pg_node_tree", PGNODETREEOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
+	 F_PG_NODE_TREE_IN, F_PG_NODE_TREE_OUT},
+	{"int2vector", INT2VECTOROID, INT2OID, -1, false, 'i', 'p', InvalidOid,
 	 F_INT2VECTORIN, F_INT2VECTOROUT},
-	{"oidvector", OIDVECTOROID, OIDOID, -1, false, 'i', 'p',
+	{"oidvector", OIDVECTOROID, OIDOID, -1, false, 'i', 'p', InvalidOid,
 	 F_OIDVECTORIN, F_OIDVECTOROUT},
-	{"_int4", INT4ARRAYOID, INT4OID, -1, false, 'i', 'x',
+	{"_int4", INT4ARRAYOID, INT4OID, -1, false, 'i', 'x', InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_text", 1009, TEXTOID, -1, false, 'i', 'x',
+	{"_text", 1009, TEXTOID, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_oid", 1028, OIDOID, -1, false, 'i', 'x',
+	{"_oid", 1028, OIDOID, -1, false, 'i', 'x', InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_char", 1002, CHAROID, -1, false, 'i', 'x',
+	{"_char", 1002, CHAROID, -1, false, 'i', 'x', InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_aclitem", 1034, ACLITEMOID, -1, false, 'i', 'x',
+	{"_aclitem", 1034, ACLITEMOID, -1, false, 'i', 'x', InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT}
 };
 
@@ -724,6 +729,7 @@ DefineAttr(char *name, char *type, int attnum)
 		attrtypes[attnum]->attbyval = Ap->am_typ.typbyval;
 		attrtypes[attnum]->attstorage = Ap->am_typ.typstorage;
 		attrtypes[attnum]->attalign = Ap->am_typ.typalign;
+		attrtypes[attnum]->attcollation = Ap->am_typ.typcollation;
 		/* if an array type, assume 1-dimensional attribute */
 		if (Ap->am_typ.typelem != InvalidOid && Ap->am_typ.typlen < 0)
 			attrtypes[attnum]->attndims = 1;
@@ -737,6 +743,7 @@ DefineAttr(char *name, char *type, int attnum)
 		attrtypes[attnum]->attbyval = TypInfo[typeoid].byval;
 		attrtypes[attnum]->attstorage = TypInfo[typeoid].storage;
 		attrtypes[attnum]->attalign = TypInfo[typeoid].align;
+		attrtypes[attnum]->attcollation = TypInfo[typeoid].collation;
 		/* if an array type, assume 1-dimensional attribute */
 		if (TypInfo[typeoid].elem != InvalidOid &&
 			attrtypes[attnum]->attlen < 0)
@@ -1144,7 +1151,7 @@ build_indices(void)
 		heap = heap_open(ILHead->il_heap, NoLock);
 		ind = index_open(ILHead->il_ind, NoLock);
 
-		index_build(heap, ind, ILHead->il_info, false);
+		index_build(heap, ind, ILHead->il_info, false, false);
 
 		index_close(ind, NoLock);
 		heap_close(heap, NoLock);
diff --git a/src/backend/catalog/.cvsignore b/src/backend/catalog/.cvsignore
deleted file mode 100644
index 242e32155f..0000000000
--- a/src/backend/catalog/.cvsignore
+++ /dev/null
@@ -1,4 +0,0 @@
-postgres.bki
-postgres.description
-postgres.shdescription
-schemapg.h
diff --git a/src/backend/catalog/.gitignore b/src/backend/catalog/.gitignore
new file mode 100644
index 0000000000..557af3c0e5
--- /dev/null
+++ b/src/backend/catalog/.gitignore
@@ -0,0 +1,4 @@
+/postgres.bki
+/postgres.description
+/postgres.shdescription
+/schemapg.h
diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index bcd4e31fb9..8526d7d555 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -4,10 +4,10 @@
 # Perl module that extracts info from catalog headers into Perl
 # data structures
 #
-# Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 # Portions Copyright (c) 1994, Regents of the University of California
 #
-# $PostgreSQL: pgsql/src/backend/catalog/Catalog.pm,v 1.3 2010/01/05 20:23:32 tgl Exp $
+# src/backend/catalog/Catalog.pm
 #
 #----------------------------------------------------------------------
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index a84d4563f5..82adb5be89 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for backend/catalog
 #
-# $PostgreSQL: pgsql/src/backend/catalog/Makefile,v 1.78 2010/05/13 11:49:48 petere Exp $
+# src/backend/catalog/Makefile
 #
 #-------------------------------------------------------------------------
 
@@ -11,9 +11,10 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
-	pg_aggregate.o pg_constraint.o pg_conversion.o pg_depend.o pg_enum.o \
-	pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o \
-	pg_db_role_setting.o pg_shdepend.o pg_type.o pgxc_class.o storage.o toasting.o
+	objectaddress.o pg_aggregate.o pg_collation.o pg_constraint.o pg_conversion.o \
+	pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
+	pg_operator.o pg_proc.o pg_db_role_setting.o pg_shdepend.o pg_type.o \
+	pgxc_class.o storage.o toasting.o
 
 BKIFILES = postgres.bki postgres.description postgres.shdescription
 
@@ -35,10 +36,11 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
 	pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \
 	pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
 	pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
-	pg_ts_parser.h pg_ts_template.h \
+	pg_ts_parser.h pg_ts_template.h pg_extension.h \
 	pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
 	pgxc_class.h \
-	pg_default_acl.h \
+	pg_foreign_table.h \
+	pg_default_acl.h pg_seclabel.h pg_collation.h \
 	toasting.h indexing.h \
 	)
diff --git a/src/backend/catalog/README b/src/backend/catalog/README
index 6d7e0e23c1..fce01ea431 100644
--- a/src/backend/catalog/README
+++ b/src/backend/catalog/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/catalog/README,v 1.14 2010/01/05 01:06:56 tgl Exp $
+src/backend/catalog/README
 
 System Catalog
 ==============
diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c
index 0470a70273..df32731b87 100644
--- a/src/backend/catalog/aclchk.c
+++ b/src/backend/catalog/aclchk.c
@@ -3,12 +3,12 @@
 * aclchk.c
 *	  Routines to check access control permissions.
 *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyr