Diffstat (limited to 'src')
-rw-r--r--src/Makefile15
-rw-r--r--src/Makefile.global.in67
-rw-r--r--src/Makefile.shlib53
-rw-r--r--src/backend/Makefile27
-rw-r--r--src/backend/access/brin/brin.c292
-rw-r--r--src/backend/access/brin/brin_inclusion.c9
-rw-r--r--src/backend/access/brin/brin_minmax.c7
-rw-r--r--src/backend/access/brin/brin_pageops.c16
-rw-r--r--src/backend/access/brin/brin_revmap.c149
-rw-r--r--src/backend/access/brin/brin_tuple.c83
-rw-r--r--src/backend/access/brin/brin_validate.c3
-rw-r--r--src/backend/access/brin/brin_xlog.c78
-rw-r--r--src/backend/access/common/Makefile4
-rw-r--r--src/backend/access/common/bufmask.c128
-rw-r--r--src/backend/access/common/heaptuple.c68
-rw-r--r--src/backend/access/common/indextuple.c2
-rw-r--r--src/backend/access/common/printsimple.c131
-rw-r--r--src/backend/access/common/printtup.c10
-rw-r--r--src/backend/access/common/reloptions.c105
-rw-r--r--src/backend/access/common/scankey.c2
-rw-r--r--src/backend/access/common/tupconvert.c132
-rw-r--r--src/backend/access/common/tupdesc.c95
-rw-r--r--src/backend/access/gin/README15
-rw-r--r--src/backend/access/gin/ginarrayproc.c2
-rw-r--r--src/backend/access/gin/ginbtree.c9
-rw-r--r--src/backend/access/gin/ginbulk.c6
-rw-r--r--src/backend/access/gin/gindatapage.c5
-rw-r--r--src/backend/access/gin/ginentrypage.c3
-rw-r--r--src/backend/access/gin/ginfast.c8
-rw-r--r--src/backend/access/gin/ginget.c33
-rw-r--r--src/backend/access/gin/gininsert.c42
-rw-r--r--src/backend/access/gin/ginlogic.c2
-rw-r--r--src/backend/access/gin/ginpostinglist.c16
-rw-r--r--src/backend/access/gin/ginscan.c12
-rw-r--r--src/backend/access/gin/ginutil.c39
-rw-r--r--src/backend/access/gin/ginvacuum.c243
-rw-r--r--src/backend/access/gin/ginvalidate.c5
-rw-r--r--src/backend/access/gin/ginxlog.c40
-rw-r--r--src/backend/access/gist/README2
-rw-r--r--src/backend/access/gist/gist.c70
-rw-r--r--src/backend/access/gist/gistbuild.c3
-rw-r--r--src/backend/access/gist/gistbuildbuffers.c2
-rw-r--r--src/backend/access/gist/gistget.c22
-rw-r--r--src/backend/access/gist/gistproc.c3
-rw-r--r--src/backend/access/gist/gistscan.c22
-rw-r--r--src/backend/access/gist/gistsplit.c2
-rw-r--r--src/backend/access/gist/gistutil.c11
-rw-r--r--src/backend/access/gist/gistvacuum.c2
-rw-r--r--src/backend/access/gist/gistvalidate.c3
-rw-r--r--src/backend/access/gist/gistxlog.c75
-rw-r--r--src/backend/access/hash/Makefile4
-rw-r--r--src/backend/access/hash/README561
-rw-r--r--src/backend/access/hash/hash.c554
-rw-r--r--src/backend/access/hash/hash_xlog.c1270
-rw-r--r--src/backend/access/hash/hashfunc.c30
-rw-r--r--src/backend/access/hash/hashinsert.c338
-rw-r--r--src/backend/access/hash/hashovfl.c805
-rw-r--r--src/backend/access/hash/hashpage.c1198
-rw-r--r--src/backend/access/hash/hashscan.c153
-rw-r--r--src/backend/access/hash/hashsearch.c280
-rw-r--r--src/backend/access/hash/hashsort.c39
-rw-r--r--src/backend/access/hash/hashutil.c235
-rw-r--r--src/backend/access/hash/hashvalidate.c3
-rw-r--r--src/backend/access/heap/heapam.c348
-rw-r--r--src/backend/access/heap/hio.c2
-rw-r--r--src/backend/access/heap/pruneheap.c2
-rw-r--r--src/backend/access/heap/rewriteheap.c29
-rw-r--r--src/backend/access/heap/syncscan.c4
-rw-r--r--src/backend/access/heap/tuptoaster.c92
-rw-r--r--src/backend/access/heap/visibilitymap.c18
-rw-r--r--src/backend/access/index/amapi.c3
-rw-r--r--src/backend/access/index/amvalidate.c2
-rw-r--r--src/backend/access/index/genam.c8
-rw-r--r--src/backend/access/index/indexam.c146
-rw-r--r--src/backend/access/nbtree/README5
-rw-r--r--src/backend/access/nbtree/nbtcompare.c2
-rw-r--r--src/backend/access/nbtree/nbtinsert.c3
-rw-r--r--src/backend/access/nbtree/nbtpage.c3
-rw-r--r--src/backend/access/nbtree/nbtree.c280
-rw-r--r--src/backend/access/nbtree/nbtsearch.c290
-rw-r--r--src/backend/access/nbtree/nbtsort.c26
-rw-r--r--src/backend/access/nbtree/nbtutils.c12
-rw-r--r--src/backend/access/nbtree/nbtvalidate.c3
-rw-r--r--src/backend/access/nbtree/nbtxlog.c53
-rw-r--r--src/backend/access/rmgrdesc/brindesc.c12
-rw-r--r--src/backend/access/rmgrdesc/clogdesc.c14
-rw-r--r--src/backend/access/rmgrdesc/committsdesc.c8
-rw-r--r--src/backend/access/rmgrdesc/dbasedesc.c2
-rw-r--r--src/backend/access/rmgrdesc/genericdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/gindesc.c49
-rw-r--r--src/backend/access/rmgrdesc/gistdesc.c4
-rw-r--r--src/backend/access/rmgrdesc/hashdesc.c149
-rw-r--r--src/backend/access/rmgrdesc/heapdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/logicalmsgdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/mxactdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/nbtdesc.c4
-rw-r--r--src/backend/access/rmgrdesc/relmapdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/replorigindesc.c2
-rw-r--r--src/backend/access/rmgrdesc/seqdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/smgrdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/spgdesc.c4
-rw-r--r--src/backend/access/rmgrdesc/standbydesc.c6
-rw-r--r--src/backend/access/rmgrdesc/tblspcdesc.c2
-rw-r--r--src/backend/access/rmgrdesc/xactdesc.c4
-rw-r--r--src/backend/access/rmgrdesc/xlogdesc.c2
-rw-r--r--src/backend/access/spgist/spgdoinsert.c42
-rw-r--r--src/backend/access/spgist/spginsert.c37
-rw-r--r--src/backend/access/spgist/spgkdtreeproc.c2
-rw-r--r--src/backend/access/spgist/spgquadtreeproc.c2
-rw-r--r--src/backend/access/spgist/spgscan.c30
-rw-r--r--src/backend/access/spgist/spgtextproc.c18
-rw-r--r--src/backend/access/spgist/spgutils.c7
-rw-r--r--src/backend/access/spgist/spgvacuum.c3
-rw-r--r--src/backend/access/spgist/spgvalidate.c3
-rw-r--r--src/backend/access/spgist/spgxlog.c28
-rw-r--r--src/backend/access/tablesample/bernoulli.c2
-rw-r--r--src/backend/access/tablesample/system.c2
-rw-r--r--src/backend/access/tablesample/tablesample.c2
-rw-r--r--src/backend/access/transam/README10
-rw-r--r--src/backend/access/transam/README.parallel4
-rw-r--r--src/backend/access/transam/clog.c64
-rw-r--r--src/backend/access/transam/commit_ts.c53
-rw-r--r--src/backend/access/transam/generic_xlog.c14
-rw-r--r--src/backend/access/transam/multixact.c33
-rw-r--r--src/backend/access/transam/parallel.c293
-rw-r--r--src/backend/access/transam/recovery.conf.sample12
-rw-r--r--src/backend/access/transam/rmgr.c14
-rw-r--r--src/backend/access/transam/slru.c20
-rw-r--r--src/backend/access/transam/subtrans.c39
-rw-r--r--src/backend/access/transam/timeline.c24
-rw-r--r--src/backend/access/transam/transam.c10
-rw-r--r--src/backend/access/transam/twophase.c863
-rw-r--r--src/backend/access/transam/twophase_rmgr.c2
-rw-r--r--src/backend/access/transam/varsup.c27
-rw-r--r--src/backend/access/transam/xact.c121
-rw-r--r--src/backend/access/transam/xlog.c1043
-rw-r--r--src/backend/access/transam/xlogarchive.c5
-rw-r--r--src/backend/access/transam/xlogfuncs.c87
-rw-r--r--src/backend/access/transam/xloginsert.c75
-rw-r--r--src/backend/access/transam/xlogreader.c104
-rw-r--r--src/backend/access/transam/xlogutils.c240
-rw-r--r--src/backend/bootstrap/bootparse.y32
-rw-r--r--src/backend/bootstrap/bootscanner.l10
-rw-r--r--src/backend/bootstrap/bootstrap.c35
-rw-r--r--src/backend/catalog/Catalog.pm67
-rw-r--r--src/backend/catalog/Makefile12
-rw-r--r--src/backend/catalog/aclchk.c652
-rw-r--r--src/backend/catalog/catalog.c10
-rw-r--r--src/backend/catalog/dependency.c313
-rw-r--r--src/backend/catalog/genbki.pl132
-rw-r--r--src/backend/catalog/heap.c470
-rw-r--r--src/backend/catalog/index.c107
-rw-r--r--src/backend/catalog/indexing.c114
-rw-r--r--src/backend/catalog/information_schema.sql99
-rw-r--r--src/backend/catalog/namespace.c164
-rw-r--r--src/backend/catalog/objectaccess.c2
-rw-r--r--src/backend/catalog/objectaddress.c1091
-rw-r--r--src/backend/catalog/partition.c2314
-rw-r--r--src/backend/catalog/pg_aggregate.c8
-rw-r--r--src/backend/catalog/pg_collation.c94
-rw-r--r--src/backend/catalog/pg_constraint.c29
-rw-r--r--src/backend/catalog/pg_conversion.c9
-rw-r--r--src/backend/catalog/pg_db_role_setting.c23
-rw-r--r--src/backend/catalog/pg_depend.c73
-rw-r--r--src/backend/catalog/pg_enum.c140
-rw-r--r--src/backend/catalog/pg_inherits.c68
-rw-r--r--src/backend/catalog/pg_largeobject.c10
-rw-r--r--src/backend/catalog/pg_namespace.c27
-rw-r--r--src/backend/catalog/pg_operator.c19
-rw-r--r--src/backend/catalog/pg_proc.c23
-rw-r--r--src/backend/catalog/pg_publication.c465
-rw-r--r--src/backend/catalog/pg_range.c7
-rw-r--r--src/backend/catalog/pg_shdepend.c41
-rw-r--r--src/backend/catalog/pg_subscription.c504
-rw-r--r--src/backend/catalog/pg_type.c56
-rw-r--r--src/backend/catalog/pgxc_class.c7
-rw-r--r--src/backend/catalog/sql_features.txt12
-rw-r--r--src/backend/catalog/storage.c2
-rw-r--r--src/backend/catalog/system_views.sql182
-rw-r--r--src/backend/catalog/toasting.c11
-rw-r--r--src/backend/commands/Makefile8
-rw-r--r--src/backend/commands/aggregatecmds.c14
-rw-r--r--src/backend/commands/alter.c110
-rw-r--r--src/backend/commands/amcmds.c9
-rw-r--r--src/backend/commands/analyze.c90
-rw-r--r--src/backend/commands/async.c10
-rw-r--r--src/backend/commands/cluster.c40
-rw-r--r--src/backend/commands/collationcmds.c452
-rw-r--r--src/backend/commands/comment.c40
-rw-r--r--src/backend/commands/constraint.c7
-rw-r--r--src/backend/commands/conversioncmds.c3
-rw-r--r--src/backend/commands/copy.c577
-rw-r--r--src/backend/commands/createas.c22
-rw-r--r--src/backend/commands/dbcommands.c153
-rw-r--r--src/backend/commands/define.c31
-rw-r--r--src/backend/commands/discard.c2
-rw-r--r--src/backend/commands/dropcmds.c193
-rw-r--r--src/backend/commands/event_trigger.c115
-rw-r--r--src/backend/commands/explain.c242
-rw-r--r--src/backend/commands/extension.c714
-rw-r--r--src/backend/commands/foreigncmds.c114
-rw-r--r--src/backend/commands/functioncmds.c138
-rw-r--r--src/backend/commands/indexcmds.c57
-rw-r--r--src/backend/commands/lockcmds.c7
-rw-r--r--src/backend/commands/matview.c47
-rw-r--r--src/backend/commands/opclasscmds.c69
-rw-r--r--src/backend/commands/operatorcmds.c12
-rw-r--r--src/backend/commands/policy.c44
-rw-r--r--src/backend/commands/portalcmds.c54
-rw-r--r--src/backend/commands/prepare.c75
-rw-r--r--src/backend/commands/proclang.c18
-rw-r--r--src/backend/commands/publicationcmds.c740
-rw-r--r--src/backend/commands/schemacmds.c37
-rw-r--r--src/backend/commands/seclabel.c32
-rw-r--r--src/backend/commands/sequence.c956
-rw-r--r--src/backend/commands/statscmds.c408
-rw-r--r--src/backend/commands/subscriptioncmds.c1133
-rw-r--r--src/backend/commands/tablecmds.c2414
-rw-r--r--src/backend/commands/tablespace.c16
-rw-r--r--src/backend/commands/trigger.c446
-rw-r--r--src/backend/commands/tsearchcmds.c65
-rw-r--r--src/backend/commands/typecmds.c116
-rw-r--r--src/backend/commands/user.c148
-rw-r--r--src/backend/commands/vacuum.c88
-rw-r--r--src/backend/commands/vacuumlazy.c68
-rw-r--r--src/backend/commands/variable.c7
-rw-r--r--src/backend/commands/view.c107
-rw-r--r--src/backend/common.mk2
-rw-r--r--src/backend/executor/Makefile20
-rw-r--r--src/backend/executor/README185
-rw-r--r--src/backend/executor/execAmi.c73
-rw-r--r--src/backend/executor/execCurrent.c2
-rw-r--r--src/backend/executor/execExpr.c2677
-rw-r--r--src/backend/executor/execExprInterp.c3565
-rw-r--r--src/backend/executor/execGrouping.c174
-rw-r--r--src/backend/executor/execIndexing.c27
-rw-r--r--src/backend/executor/execJunk.c2
-rw-r--r--src/backend/executor/execMain.c622
-rw-r--r--src/backend/executor/execParallel.c186
-rw-r--r--src/backend/executor/execProcnode.c73
-rw-r--r--src/backend/executor/execQual.c5656
-rw-r--r--src/backend/executor/execReplication.c573
-rw-r--r--src/backend/executor/execSRF.c928
-rw-r--r--src/backend/executor/execScan.c45
-rw-r--r--src/backend/executor/execTuples.c7
-rw-r--r--src/backend/executor/execUtils.c450
-rw-r--r--src/backend/executor/functions.c121
-rw-r--r--src/backend/executor/instrument.c2
-rw-r--r--src/backend/executor/nodeAgg.c1322
-rw-r--r--src/backend/executor/nodeAppend.c8
-rw-r--r--src/backend/executor/nodeBitmapAnd.c2
-rw-r--r--src/backend/executor/nodeBitmapHeapscan.c538
-rw-r--r--src/backend/executor/nodeBitmapIndexscan.c6
-rw-r--r--src/backend/executor/nodeBitmapOr.c6
-rw-r--r--src/backend/executor/nodeCtescan.c19
-rw-r--r--src/backend/executor/nodeCustom.c25
-rw-r--r--src/backend/executor/nodeForeignscan.c35
-rw-r--r--src/backend/executor/nodeFunctionscan.c27
-rw-r--r--src/backend/executor/nodeGather.c86
-rw-r--r--src/backend/executor/nodeGatherMerge.c681
-rw-r--r--src/backend/executor/nodeGroup.c56
-rw-r--r--src/backend/executor/nodeHash.c55
-rw-r--r--src/backend/executor/nodeHashjoin.c132
-rw-r--r--src/backend/executor/nodeIndexonlyscan.c204
-rw-r--r--src/backend/executor/nodeIndexscan.c230
-rw-r--r--src/backend/executor/nodeLimit.c21
-rw-r--r--src/backend/executor/nodeLockRows.c6
-rw-r--r--src/backend/executor/nodeMaterial.c2
-rw-r--r--src/backend/executor/nodeMergeAppend.c8
-rw-r--r--src/backend/executor/nodeMergejoin.c156
-rw-r--r--src/backend/executor/nodeModifyTable.c316
-rw-r--r--src/backend/executor/nodeNamedtuplestorescan.c199
-rw-r--r--src/backend/executor/nodeNestloop.c85
-rw-r--r--src/backend/executor/nodeProjectSet.c323
-rw-r--r--src/backend/executor/nodeRecursiveunion.c30
-rw-r--r--src/backend/executor/nodeResult.c54
-rw-r--r--src/backend/executor/nodeSamplescan.c28
-rw-r--r--src/backend/executor/nodeSeqscan.c12
-rw-r--r--src/backend/executor/nodeSetOp.c61
-rw-r--r--src/backend/executor/nodeSort.c7
-rw-r--r--src/backend/executor/nodeSubplan.c156
-rw-r--r--src/backend/executor/nodeSubqueryscan.c12
-rw-r--r--src/backend/executor/nodeTableFuncscan.c498
-rw-r--r--src/backend/executor/nodeTidscan.c151
-rw-r--r--src/backend/executor/nodeUnique.c6
-rw-r--r--src/backend/executor/nodeValuesscan.c17
-rw-r--r--src/backend/executor/nodeWindowAgg.c149
-rw-r--r--src/backend/executor/nodeWorktablescan.c16
-rw-r--r--src/backend/executor/spi.c471
-rw-r--r--src/backend/executor/tqueue.c6
-rw-r--r--src/backend/executor/tstoreReceiver.c2
-rw-r--r--src/backend/foreign/foreign.c14
-rw-r--r--src/backend/lib/Makefile4
-rw-r--r--src/backend/lib/binaryheap.c2
-rw-r--r--src/backend/lib/bipartite_match.c2
-rw-r--r--src/backend/lib/hyperloglog.c2
-rw-r--r--src/backend/lib/ilist.c2
-rw-r--r--src/backend/lib/knapsack.c114
-rw-r--r--src/backend/lib/pairingheap.c2
-rw-r--r--src/backend/lib/rbtree.c339
-rw-r--r--src/backend/lib/stringinfo.c2
-rw-r--r--src/backend/libpq/Makefile4
-rw-r--r--src/backend/libpq/auth-scram.c1134
-rw-r--r--src/backend/libpq/auth.c646
-rw-r--r--src/backend/libpq/be-fsstubs.c53
-rw-r--r--src/backend/libpq/be-secure-openssl.c465
-rw-r--r--src/backend/libpq/be-secure.c32
-rw-r--r--src/backend/libpq/crypt.c308
-rw-r--r--src/backend/libpq/hba.c1227
-rw-r--r--src/backend/libpq/ifaddr.c (renamed from src/backend/libpq/ip.c)237
-rw-r--r--src/backend/libpq/pg_hba.conf.sample24
-rw-r--r--src/backend/libpq/pqcomm.c75
-rw-r--r--src/backend/libpq/pqformat.c2
-rw-r--r--src/backend/libpq/pqmq.c32
-rw-r--r--src/backend/libpq/pqsignal.c2
-rw-r--r--src/backend/main/main.c4
-rw-r--r--src/backend/nls.mk2
-rw-r--r--src/backend/nodes/bitmapset.c32
-rw-r--r--src/backend/nodes/copyfuncs.c529
-rw-r--r--src/backend/nodes/equalfuncs.c358
-rw-r--r--src/backend/nodes/extensible.c2
-rw-r--r--src/backend/nodes/list.c2
-rw-r--r--src/backend/nodes/makefuncs.c19
-rw-r--r--src/backend/nodes/nodeFuncs.c145
-rw-r--r--src/backend/nodes/nodes.c2
-rw-r--r--src/backend/nodes/outfuncs.c630
-rw-r--r--src/backend/nodes/params.c2
-rw-r--r--src/backend/nodes/print.c10
-rw-r--r--src/backend/nodes/read.c2
-rw-r--r--src/backend/nodes/readfuncs.c190
-rw-r--r--src/backend/nodes/tidbitmap.c738
-rw-r--r--src/backend/nodes/value.c2
-rw-r--r--src/backend/optimizer/README108
-rw-r--r--src/backend/optimizer/geqo/geqo_copy.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_erx.c6
-rw-r--r--src/backend/optimizer/geqo/geqo_eval.c6
-rw-r--r--src/backend/optimizer/geqo/geqo_main.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_misc.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_pool.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_random.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_selection.c2
-rw-r--r--src/backend/optimizer/path/allpaths.c638
-rw-r--r--src/backend/optimizer/path/clausesel.c137
-rw-r--r--src/backend/optimizer/path/costsize.c740
-rw-r--r--src/backend/optimizer/path/equivclass.c82
-rw-r--r--src/backend/optimizer/path/indxpath.c179
-rw-r--r--src/backend/optimizer/path/joinpath.c842
-rw-r--r--src/backend/optimizer/path/joinrels.c35
-rw-r--r--src/backend/optimizer/path/pathkeys.c66
-rw-r--r--src/backend/optimizer/path/tidpath.c27
-rw-r--r--src/backend/optimizer/plan/analyzejoins.c285
-rw-r--r--src/backend/optimizer/plan/createplan.c621
-rw-r--r--src/backend/optimizer/plan/initsplan.c104
-rw-r--r--src/backend/optimizer/plan/planagg.c15
-rw-r--r--src/backend/optimizer/plan/planmain.c22
-rw-r--r--src/backend/optimizer/plan/planner.c1670
-rw-r--r--src/backend/optimizer/plan/setrefs.c129
-rw-r--r--src/backend/optimizer/plan/subselect.c119
-rw-r--r--src/backend/optimizer/prep/Makefile2
-rw-r--r--src/backend/optimizer/prep/prepjointree.c49
-rw-r--r--src/backend/optimizer/prep/prepqual.c2
-rw-r--r--src/backend/optimizer/prep/prepsecurity.c486
-rw-r--r--src/backend/optimizer/prep/preptlist.c4
-rw-r--r--src/backend/optimizer/prep/prepunion.c200
-rw-r--r--src/backend/optimizer/util/clauses.c395
-rw-r--r--src/backend/optimizer/util/joininfo.c4
-rw-r--r--src/backend/optimizer/util/orclauses.c18
-rw-r--r--src/backend/optimizer/util/pathnode.c424
-rw-r--r--src/backend/optimizer/util/placeholder.c2
-rw-r--r--src/backend/optimizer/util/plancat.c132
-rw-r--r--src/backend/optimizer/util/predtest.c4
-rw-r--r--src/backend/optimizer/util/relnode.c227
-rw-r--r--src/backend/optimizer/util/restrictinfo.c139
-rw-r--r--src/backend/optimizer/util/tlist.c375
-rw-r--r--src/backend/optimizer/util/var.c2
-rw-r--r--src/backend/parser/Makefile12
-rw-r--r--src/backend/parser/analyze.c308
-rw-r--r--src/backend/parser/check_keywords.pl33
-rw-r--r--src/backend/parser/gram.y2702
-rw-r--r--src/backend/parser/parse_agg.c18
-rw-r--r--src/backend/parser/parse_clause.c384
-rw-r--r--src/backend/parser/parse_coerce.c70
-rw-r--r--src/backend/parser/parse_collate.c19
-rw-r--r--src/backend/parser/parse_cte.c13
-rw-r--r--src/backend/parser/parse_enr.c29
-rw-r--r--src/backend/parser/parse_expr.c304
-rw-r--r--src/backend/parser/parse_func.c228
-rw-r--r--src/backend/parser/parse_node.c8
-rw-r--r--src/backend/parser/parse_oper.c30
-rw-r--r--src/backend/parser/parse_param.c4
-rw-r--r--src/backend/parser/parse_relation.c321
-rw-r--r--src/backend/parser/parse_target.c170
-rw-r--r--src/backend/parser/parse_type.c7
-rw-r--r--src/backend/parser/parse_utilcmd.c845
-rw-r--r--src/backend/parser/parser.c5
-rw-r--r--src/backend/parser/scan.l9
-rw-r--r--src/backend/parser/scansup.c2
-rw-r--r--src/backend/pgxc/cluster/pause.c1
-rw-r--r--src/backend/pgxc/locator/locator.c2
-rw-r--r--src/backend/pgxc/nodemgr/groupmgr.c5
-rw-r--r--src/backend/pgxc/nodemgr/nodemgr.c12
-rw-r--r--src/backend/pgxc/pool/execRemote.c16
-rw-r--r--src/backend/pgxc/pool/pgxcnode.c3
-rw-r--r--src/backend/pgxc/pool/poolmgr.c3
-rw-r--r--src/backend/pgxc/squeue/squeue.c17
-rw-r--r--src/backend/po/de.po11544
-rw-r--r--src/backend/po/fr.po16979
-rw-r--r--src/backend/po/ko.po24982
-rw-r--r--src/backend/po/pl.po13646
-rw-r--r--src/backend/po/ru.po13506
-rw-r--r--src/backend/port/Makefile4
-rw-r--r--src/backend/port/atomics.c81
-rw-r--r--src/backend/port/darwin/README36
-rw-r--r--src/backend/port/darwin/system.c104
-rw-r--r--src/backend/port/dynloader/aix.h2
-rw-r--r--src/backend/port/dynloader/cygwin.h2
-rw-r--r--src/backend/port/dynloader/darwin.c5
-rw-r--r--src/backend/port/dynloader/freebsd.c2
-rw-r--r--src/backend/port/dynloader/freebsd.h3
-rw-r--r--src/backend/port/dynloader/hpux.c2
-rw-r--r--src/backend/port/dynloader/hpux.h2
-rw-r--r--src/backend/port/dynloader/linux.c4
-rw-r--r--src/backend/port/dynloader/linux.h2
-rw-r--r--src/backend/port/dynloader/netbsd.c2
-rw-r--r--src/backend/port/dynloader/netbsd.h3
-rw-r--r--src/backend/port/dynloader/openbsd.c2
-rw-r--r--src/backend/port/dynloader/openbsd.h3
-rw-r--r--src/backend/port/dynloader/sco.c7
-rw-r--r--src/backend/port/dynloader/sco.h46
-rw-r--r--src/backend/port/dynloader/solaris.h2
-rw-r--r--src/backend/port/dynloader/unixware.c7
-rw-r--r--src/backend/port/dynloader/unixware.h49
-rw-r--r--src/backend/port/posix_sema.c104
-rw-r--r--src/backend/port/sysv_sema.c53
-rw-r--r--src/backend/port/sysv_shmem.c251
-rw-r--r--src/backend/port/tas/sunstudio_sparc.s2
-rw-r--r--src/backend/port/tas/sunstudio_x86.s2
-rw-r--r--src/backend/port/win32/crashdump.c5
-rw-r--r--src/backend/port/win32/mingwcompat.c4
-rw-r--r--src/backend/port/win32/signal.c2
-rw-r--r--src/backend/port/win32/socket.c30
-rw-r--r--src/backend/port/win32/timer.c2
-rw-r--r--src/backend/port/win32_sema.c40
-rw-r--r--src/backend/port/win32_shmem.c2
-rw-r--r--src/backend/postmaster/autovacuum.c695
-rw-r--r--src/backend/postmaster/bgworker.c246
-rw-r--r--src/backend/postmaster/bgwriter.c28
-rw-r--r--src/backend/postmaster/checkpointer.c56
-rw-r--r--src/backend/postmaster/clustermon.c6
-rw-r--r--src/backend/postmaster/fork_process.c2
-rw-r--r--src/backend/postmaster/pgarch.c10
-rw-r--r--src/backend/postmaster/pgstat.c918
-rw-r--r--src/backend/postmaster/postmaster.c485
-rw-r--r--src/backend/postmaster/startup.c3
-rw-r--r--src/backend/postmaster/syslogger.c88
-rw-r--r--src/backend/postmaster/walwriter.c11
-rw-r--r--src/backend/regex/README96
-rw-r--r--src/backend/regex/regc_color.c891
-rw-r--r--src/backend/regex/regc_cvec.c10
-rw-r--r--src/backend/regex/regc_lex.c12
-rw-r--r--src/backend/regex/regc_locale.c126
-rw-r--r--src/backend/regex/regc_nfa.c8
-rw-r--r--src/backend/regex/regc_pg_locale.c150
-rw-r--r--src/backend/regex/regcomp.c88
-rw-r--r--src/backend/regex/rege_dfa.c6
-rw-r--r--src/backend/regex/regexec.c4
-rw-r--r--src/backend/regex/regexport.c178
-rw-r--r--src/backend/regex/regprefix.c7
-rw-r--r--src/backend/replication/Makefile5
-rw-r--r--src/backend/replication/README2
-rw-r--r--src/backend/replication/basebackup.c324
-rw-r--r--src/backend/replication/libpqwalreceiver/Makefile2
-rw-r--r--src/backend/replication/libpqwalreceiver/libpqwalreceiver.c790
-rw-r--r--src/backend/replication/logical/Makefile4
-rw-r--r--src/backend/replication/logical/decode.c5
-rw-r--r--src/backend/replication/logical/launcher.c1011
-rw-r--r--src/backend/replication/logical/logical.c74
-rw-r--r--src/backend/replication/logical/logicalfuncs.c32
-rw-r--r--src/backend/replication/logical/message.c6
-rw-r--r--src/backend/replication/logical/origin.c38
-rw-r--r--src/backend/replication/logical/proto.c637
-rw-r--r--src/backend/replication/logical/relation.c490
-rw-r--r--src/backend/replication/logical/reorderbuffer.c149
-rw-r--r--src/backend/replication/logical/snapbuild.c696
-rw-r--r--src/backend/replication/logical/tablesync.c905
-rw-r--r--src/backend/replication/logical/worker.c1617
-rw-r--r--src/backend/replication/pgoutput/Makefile30
-rw-r--r--src/backend/replication/pgoutput/pgoutput.c597
-rw-r--r--src/backend/replication/repl_gram.y124
-rw-r--r--src/backend/replication/repl_scanner.l11
-rw-r--r--src/backend/replication/slot.c223
-rw-r--r--src/backend/replication/slotfuncs.c35
-rw-r--r--src/backend/replication/syncrep.c302
-rw-r--r--src/backend/replication/syncrep_gram.y16
-rw-r--r--src/backend/replication/syncrep_scanner.l7
-rw-r--r--src/backend/replication/walreceiver.c188
-rw-r--r--src/backend/replication/walreceiverfuncs.c17
-rw-r--r--src/backend/replication/walsender.c1156
-rw-r--r--src/backend/rewrite/rewriteDefine.c47
-rw-r--r--src/backend/rewrite/rewriteHandler.c146
-rw-r--r--src/backend/rewrite/rewriteManip.c10
-rw-r--r--src/backend/rewrite/rewriteRemove.c4
-rw-r--r--src/backend/rewrite/rewriteSupport.c62
-rw-r--r--src/backend/rewrite/rowsecurity.c88
-rw-r--r--src/backend/snowball/Makefile3
-rw-r--r--src/backend/snowball/dict_snowball.c2
-rw-r--r--src/backend/snowball/snowball.sql.in2
-rw-r--r--src/backend/snowball/snowball_func.sql.in2
-rw-r--r--src/backend/statistics/Makefile (renamed from src/backend/port/darwin/Makefile)10
-rw-r--r--src/backend/statistics/README97
-rw-r--r--src/backend/statistics/README.dependencies119
-rw-r--r--src/backend/statistics/dependencies.c1075
-rw-r--r--src/backend/statistics/extended_stats.c514
-rw-r--r--src/backend/statistics/mvdistinct.c696
-rw-r--r--src/backend/storage/buffer/README2
-rw-r--r--src/backend/storage/buffer/buf_init.c18
-rw-r--r--src/backend/storage/buffer/buf_table.c2
-rw-r--r--src/backend/storage/buffer/bufmgr.c93
-rw-r--r--src/backend/storage/buffer/freelist.c6
-rw-r--r--src/backend/storage/buffer/localbuf.c22
-rw-r--r--src/backend/storage/file/buffile.c13
-rw-r--r--src/backend/storage/file/copydir.c19
-rw-r--r--src/backend/storage/file/fd.c290
-rw-r--r--src/backend/storage/file/reinit.c14
-rw-r--r--src/backend/storage/freespace/freespace.c22
-rw-r--r--src/backend/storage/freespace/fsmpage.c2
-rw-r--r--src/backend/storage/freespace/indexfsm.c2
-rw-r--r--src/backend/storage/ipc/dsm.c146
-rw-r--r--src/backend/storage/ipc/dsm_impl.c99
-rw-r--r--src/backend/storage/ipc/ipc.c2
-rw-r--r--src/backend/storage/ipc/ipci.c23
-rw-r--r--src/backend/storage/ipc/latch.c377
-rw-r--r--src/backend/storage/ipc/pmsignal.c2
-rw-r--r--src/backend/storage/ipc/procarray.c115
-rw-r--r--src/backend/storage/ipc/procsignal.c2
-rw-r--r--src/backend/storage/ipc/shm_mq.c11
-rw-r--r--src/backend/storage/ipc/shm_toc.c4
-rw-r--r--src/backend/storage/ipc/shmem.c108
-rw-r--r--src/backend/storage/ipc/shmqueue.c2
-rw-r--r--src/backend/storage/ipc/sinval.c2
-rw-r--r--src/backend/storage/ipc/sinvaladt.c2
-rw-r--r--src/backend/storage/ipc/standby.c29
-rw-r--r--src/backend/storage/large_object/inv_api.c20
-rw-r--r--src/backend/storage/lmgr/Makefile2
-rw-r--r--src/backend/storage/lmgr/condition_variable.c228
-rw-r--r--src/backend/storage/lmgr/deadlock.c2
-rw-r--r--src/backend/storage/lmgr/generate-lwlocknames.pl32
-rw-r--r--src/backend/storage/lmgr/lmgr.c8
-rw-r--r--src/backend/storage/lmgr/lock.c11
-rw-r--r--src/backend/storage/lmgr/lwlock.c315
-rw-r--r--src/backend/storage/lmgr/lwlocknames.txt3
-rw-r--r--src/backend/storage/lmgr/predicate.c117
-rw-r--r--src/backend/storage/lmgr/proc.c61
-rw-r--r--src/backend/storage/lmgr/s_lock.c4
-rw-r--r--src/backend/storage/lmgr/spin.c29
-rw-r--r--src/backend/storage/page/bufpage.c400
-rw-r--r--src/backend/storage/page/checksum.c2
-rw-r--r--src/backend/storage/page/itemptr.c17
-rw-r--r--src/backend/storage/smgr/md.c425
-rw-r--r--src/backend/storage/smgr/smgr.c6
-rw-r--r--src/backend/storage/smgr/smgrtype.c3
-rw-r--r--src/backend/tcop/dest.c20
-rw-r--r--src/backend/tcop/fastpath.c24
-rw-r--r--src/backend/tcop/postgres.c181
-rw-r--r--src/backend/tcop/pquery.c173
-rw-r--r--src/backend/tcop/utility.c434
-rw-r--r--src/backend/tsearch/Makefile2
-rw-r--r--src/backend/tsearch/dict.c12
-rw-r--r--src/backend/tsearch/dict_ispell.c3
-rw-r--r--src/backend/tsearch/dict_simple.c3
-rw-r--r--src/backend/tsearch/dict_synonym.c3
-rw-r--r--src/backend/tsearch/dict_thesaurus.c5
-rw-r--r--src/backend/tsearch/regis.c2
-rw-r--r--src/backend/tsearch/spell.c8
-rw-r--r--src/backend/tsearch/to_tsany.c166
-rw-r--r--src/backend/tsearch/ts_locale.c2
-rw-r--r--src/backend/tsearch/ts_parse.c8
-rw-r--r--src/backend/tsearch/ts_selfuncs.c21
-rw-r--r--src/backend/tsearch/ts_typanalyze.c2
-rw-r--r--src/backend/tsearch/ts_utils.c2
-rw-r--r--src/backend/tsearch/wparser.c217
-rw-r--r--src/backend/tsearch/wparser_def.c6
-rw-r--r--src/backend/utils/.gitignore1
-rw-r--r--src/backend/utils/Gen_dummy_probes.pl2
-rw-r--r--src/backend/utils/Gen_dummy_probes.sed2
-rw-r--r--src/backend/utils/Gen_fmgrtab.pl105
-rw-r--r--src/backend/utils/Makefile11
-rw-r--r--src/backend/utils/adt/Makefile4
-rw-r--r--src/backend/utils/adt/acl.c311
-rw-r--r--src/backend/utils/adt/amutils.c4
-rw-r--r--src/backend/utils/adt/array_expanded.c6
-rw-r--r--src/backend/utils/adt/array_selfuncs.c94
-rw-r--r--src/backend/utils/adt/array_typanalyze.c3
-rw-r--r--src/backend/utils/adt/array_userfuncs.c92
-rw-r--r--src/backend/utils/adt/arrayfuncs.c36
-rw-r--r--src/backend/utils/adt/arrayutils.c2
-rw-r--r--src/backend/utils/adt/ascii.c3
-rw-r--r--src/backend/utils/adt/bool.c5
-rw-r--r--src/backend/utils/adt/cash.c76
-rw-r--r--src/backend/utils/adt/char.c8
-rw-r--r--src/backend/utils/adt/date.c359
-rw-r--r--src/backend/utils/adt/datetime.c213
-rw-r--r--src/backend/utils/adt/datum.c6
-rw-r--r--src/backend/utils/adt/dbsize.c45
-rw-r--r--src/backend/utils/adt/domains.c50
-rw-r--r--src/backend/utils/adt/encode.c22
-rw-r--r--src/backend/utils/adt/enum.c122
-rw-r--r--src/backend/utils/adt/expandeddatum.c2
-rw-r--r--src/backend/utils/adt/float.c14
-rw-r--r--src/backend/utils/adt/format_type.c2
-rw-r--r--src/backend/utils/adt/formatting.c1219
-rw-r--r--src/backend/utils/adt/genfile.c104
-rw-r--r--src/backend/utils/adt/geo_ops.c2
-rw-r--r--src/backend/utils/adt/geo_selfuncs.c3
-rw-r--r--src/backend/utils/adt/geo_spgist.c26
-rw-r--r--src/backend/utils/adt/inet_cidr_ntop.c1
-rw-r--r--src/backend/utils/adt/inet_net_pton.c1
-rw-r--r--src/backend/utils/adt/int.c17
-rw-r--r--src/backend/utils/adt/int8.c14
-rw-r--r--src/backend/utils/adt/json.c63
-rw-r--r--src/backend/utils/adt/jsonb.c9
-rw-r--r--src/backend/utils/adt/jsonb_gin.c7
-rw-r--r--src/backend/utils/adt/jsonb_op.c3
-rw-r--r--src/backend/utils/adt/jsonb_util.c17
-rw-r--r--src/backend/utils/adt/jsonfuncs.c2140
-rw-r--r--src/backend/utils/adt/levenshtein.c2
-rw-r--r--src/backend/utils/adt/like.c79
-rw-r--r--src/backend/utils/adt/like_match.c2
-rw-r--r--src/backend/utils/adt/lockfuncs.c121
-rw-r--r--src/backend/utils/adt/mac.c229
-rw-r--r--src/backend/utils/adt/mac8.c568
-rw-r--r--src/backend/utils/adt/misc.c136
-rw-r--r--src/backend/utils/adt/nabstime.c19
-rw-r--r--src/backend/utils/adt/name.c2
-rw-r--r--src/backend/utils/adt/network.c123
-rw-r--r--src/backend/utils/adt/network_gist.c3
-rw-r--r--src/backend/utils/adt/network_selfuncs.c217
-rw-r--r--src/backend/utils/adt/network_spgist.c711
-rw-r--r--src/backend/utils/adt/numeric.c681
-rw-r--r--src/backend/utils/adt/numutils.c20
-rw-r--r--src/backend/utils/adt/oid.c39
-rw-r--r--src/backend/utils/adt/oracle_compat.c2
-rw-r--r--src/backend/utils/adt/orderedsetaggs.c8
-rw-r--r--src/backend/utils/adt/pg_locale.c489
-rw-r--r--src/backend/utils/adt/pg_lsn.c8
-rw-r--r--src/backend/utils/adt/pg_upgrade_support.c17
-rw-r--r--src/backend/utils/adt/pgstatfuncs.c196
-rw-r--r--src/backend/utils/adt/pseudotypes.c348
-rw-r--r--src/backend/utils/adt/quote.c10
-rw-r--r--src/backend/utils/adt/rangetypes.c14
-rw-r--r--src/backend/utils/adt/rangetypes_gist.c2
-rw-r--r--src/backend/utils/adt/rangetypes_selfuncs.c72
-rw-r--r--src/backend/utils/adt/rangetypes_spgist.c9
-rw-r--r--src/backend/utils/adt/rangetypes_typanalyze.c2
-rw-r--r--src/backend/utils/adt/regexp.c140
-rw-r--r--src/backend/utils/adt/regproc.c241
-rw-r--r--src/backend/utils/adt/ri_triggers.c24
-rw-r--r--src/backend/utils/adt/rowtypes.c2
-rw-r--r--src/backend/utils/adt/ruleutils.c710
-rw-r--r--src/backend/utils/adt/selfuncs.c987
-rw-r--r--src/backend/utils/adt/tid.c34
-rw-r--r--src/backend/utils/adt/timestamp.c852
-rw-r--r--src/backend/utils/adt/trigfuncs.c2
-rw-r--r--src/backend/utils/adt/tsginidx.c42
-rw-r--r--src/backend/utils/adt/tsgistidx.c20
-rw-r--r--src/backend/utils/adt/tsquery.c27
-rw-r--r--src/backend/utils/adt/tsquery_cleanup.c427
-rw-r--r--src/backend/utils/adt/tsquery_gist.c3
-rw-r--r--src/backend/utils/adt/tsquery_op.c7
-rw-r--r--src/backend/utils/adt/tsquery_rewrite.c244
-rw-r--r--src/backend/utils/adt/tsquery_util.c84
-rw-r--r--src/backend/utils/adt/tsrank.c7
-rw-r--r--src/backend/utils/adt/tsvector.c3
-rw-r--r--src/backend/utils/adt/tsvector_op.c558
-rw-r--r--src/backend/utils/adt/tsvector_parser.c2
-rw-r--r--src/backend/utils/adt/txid.c159
-rw-r--r--src/backend/utils/adt/uuid.c15
-rw-r--r--src/backend/utils/adt/varbit.c26
-rw-r--r--src/backend/utils/adt/varchar.c6
-rw-r--r--src/backend/utils/adt/varlena.c260
-rw-r--r--src/backend/utils/adt/version.c2
-rw-r--r--src/backend/utils/adt/windowfuncs.c4
-rw-r--r--src/backend/utils/adt/xid.c20
-rw-r--r--src/backend/utils/adt/xml.c672
-rw-r--r--src/backend/utils/cache/attoptcache.c2
-rw-r--r--src/backend/utils/cache/catcache.c103
-rw-r--r--src/backend/utils/cache/evtcache.c6
-rw-r--r--src/backend/utils/cache/inval.c92
-rw-r--r--src/backend/utils/cache/lsyscache.c203
-rw-r--r--src/backend/utils/cache/plancache.c193
-rw-r--r--src/backend/utils/cache/relcache.c686
-rw-r--r--src/backend/utils/cache/relfilenodemap.c4
-rw-r--r--src/backend/utils/cache/relmapper.c9
-rw-r--r--src/backend/utils/cache/spccache.c13
-rw-r--r--src/backend/utils/cache/syscache.c162
-rw-r--r--src/backend/utils/cache/ts_cache.c7
-rw-r--r--src/backend/utils/cache/typcache.c66
-rw-r--r--src/backend/utils/errcodes.txt5
-rw-r--r--src/backend/utils/error/assert.c2
-rw-r--r--src/backend/utils/error/elog.c85
-rw-r--r--src/backend/utils/fmgr/README254
-rw-r--r--src/backend/utils/fmgr/dfmgr.c37
-rw-r--r--src/backend/utils/fmgr/fmgr.c553
-rw-r--r--src/backend/utils/fmgr/funcapi.c9
-rw-r--r--src/backend/utils/generate-errcodes.pl4
-rw-r--r--src/backend/utils/hash/dynahash.c12
-rw-r--r--src/backend/utils/hash/hashfn.c2
-rw-r--r--src/backend/utils/hash/pg_crc.c2
-rw-r--r--src/backend/utils/init/globals.c5
-rw-r--r--src/backend/utils/init/miscinit.c47
-rw-r--r--src/backend/utils/init/postinit.c18
-rw-r--r--src/backend/utils/mb/Unicode/Makefile67
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_BIG5.pl189
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl169
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl312
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl671
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl134
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl148
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_GB18030.pl101
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_JOHAB.pl48
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl263
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SJIS.pl197
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_UHC.pl60
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_most.pl130
-rw-r--r--src/backend/utils/mb/Unicode/big5_to_utf8.map18321
-rw-r--r--src/backend/utils/mb/Unicode/convutils.pm818
-rw-r--r--src/backend/utils/mb/Unicode/euc_cn_to_utf8.map9723
-rw-r--r--src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map14877
-rw-r--r--src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map30
-rw-r--r--src/backend/utils/mb/Unicode/euc_jp_to_utf8.map17337
-rw-r--r--src/backend/utils/mb/Unicode/euc_kr_to_utf8.map10725
-rw-r--r--src/backend/utils/mb/Unicode/euc_tw_to_utf8.map31407
-rw-r--r--src/backend/utils/mb/Unicode/gb18030_to_utf8.map41882
-rw-r--r--src/backend/utils/mb/Unicode/gbk_to_utf8.map28344
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_10_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_13_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_14_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_15_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_16_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_2_to_utf8.map205
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_3_to_utf8.map198
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_4_to_utf8.map205
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_5_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_6_to_utf8.map158
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_7_to_utf8.map234
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_8_to_utf8.map201
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_9_to_utf8.map205
-rw-r--r--src/backend/utils/mb/Unicode/johab_to_utf8.map23329
-rw-r--r--src/backend/utils/mb/Unicode/koi8r_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/koi8u_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map14636
-rw-r--r--src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map30
-rw-r--r--src/backend/utils/mb/Unicode/sjis_to_utf8.map10202
-rw-r--r--src/backend/utils/mb/Unicode/ucs2utf.pl35
-rw-r--r--src/backend/utils/mb/Unicode/uhc_to_utf8.map23790
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_big5.map17809
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_cn.map11489
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map24001
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map30
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jp.map20316
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_kr.map14619
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_tw.map24574
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_gb18030.map40292
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_gbk.map26061
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_10.map240
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_13.map239
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_14.map272
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_15.map227
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_16.map257
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_2.map240
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_3.map232
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_4.map240
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_5.map229
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_6.map171
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_7.map248
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_8.map194
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_iso8859_9.map226
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_johab.map23382
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_koi8r.map301
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_koi8u.map312
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map19087
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map30
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_sjis.map11651
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_uhc.map23614
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1250.map266
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1251.map259
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1252.map267
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1253.map244
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1254.map276
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1255.map260
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1256.map320
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1257.map259
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win1258.map284
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win866.map280
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_win874.map225
-rw-r--r--src/backend/utils/mb/Unicode/win1250_to_utf8.map232
-rw-r--r--src/backend/utils/mb/Unicode/win1251_to_utf8.map236
-rw-r--r--src/backend/utils/mb/Unicode/win1252_to_utf8.map232
-rw-r--r--src/backend/utils/mb/Unicode/win1253_to_utf8.map220
-rw-r--r--src/backend/utils/mb/Unicode/win1254_to_utf8.map230
-rw-r--r--src/backend/utils/mb/Unicode/win1255_to_utf8.map214
-rw-r--r--src/backend/utils/mb/Unicode/win1256_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/win1257_to_utf8.map225
-rw-r--r--src/backend/utils/mb/Unicode/win1258_to_utf8.map228
-rw-r--r--src/backend/utils/mb/Unicode/win866_to_utf8.map237
-rw-r--r--src/backend/utils/mb/Unicode/win874_to_utf8.map204
-rw-r--r--src/backend/utils/mb/conv.c254
-rw-r--r--src/backend/utils/mb/conversion_procs/ascii_and_mic/ascii_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c10
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c8
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c77
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c8
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c6
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c56
-rw-r--r--src/backend/utils/mb/encnames.c76
-rw-r--r--src/backend/utils/mb/mbutils.c2
-rw-r--r--src/backend/utils/misc/Makefile7
-rw-r--r--src/backend/utils/misc/README2
-rw-r--r--src/backend/utils/misc/backend_random.c158
-rw-r--r--src/backend/utils/misc/guc-file.l6
-rw-r--r--src/backend/utils/misc/guc.c431
-rw-r--r--src/backend/utils/misc/help_config.c2
-rw-r--r--src/backend/utils/misc/pg_config.c2
-rw-r--r--src/backend/utils/misc/pg_controldata.c61
-rw-r--r--src/backend/utils/misc/pg_rusage.c8
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample77
-rw-r--r--src/backend/utils/misc/ps_status.c33
-rw-r--r--src/backend/utils/misc/queryenvironment.c144
-rw-r--r--src/backend/utils/misc/rls.c5
-rw-r--r--src/backend/utils/misc/sampling.c2
-rw-r--r--src/backend/utils/misc/superuser.c2
-rw-r--r--src/backend/utils/misc/timeout.c2
-rw-r--r--src/backend/utils/misc/tzparser.c6
-rw-r--r--src/backend/utils/mmgr/Makefile2
-rw-r--r--src/backend/utils/mmgr/README329
-rw-r--r--src/backend/utils/mmgr/aset.c316
-rw-r--r--src/backend/utils/mmgr/dsa.c2241
-rw-r--r--src/backend/utils/mmgr/freepage.c1886
-rw-r--r--src/backend/utils/mmgr/mcxt.c143
-rw-r--r--src/backend/utils/mmgr/memdebug.c93
-rw-r--r--src/backend/utils/mmgr/portalmem.c49
-rw-r--r--src/backend/utils/mmgr/slab.c767
-rw-r--r--src/backend/utils/probes.d24
-rw-r--r--src/backend/utils/resowner/resowner.c4
-rw-r--r--src/backend/utils/sort/logtape.c830
-rw-r--r--src/backend/utils/sort/sortsupport.c2
-rw-r--r--src/backend/utils/sort/tuplesort.c1547
-rw-r--r--src/backend/utils/sort/tuplestore.c18
-rw-r--r--src/backend/utils/time/combocid.c2
-rw-r--r--src/backend/utils/time/snapmgr.c268
-rw-r--r--src/backend/utils/time/tqual.c14
-rw-r--r--src/bcc32.mak47
-rw-r--r--src/bin/Makefile6
-rw-r--r--src/bin/initdb/Makefile7
-rw-r--r--src/bin/initdb/findtimezone.c389
-rw-r--r--src/bin/initdb/initdb.c674
-rw-r--r--src/bin/initdb/nls.mk4
-rw-r--r--src/bin/initdb/po/de.po405
-rw-r--r--src/bin/initdb/po/fr.po380
-rw-r--r--src/bin/initdb/po/he.po1053
-rw-r--r--src/bin/initdb/po/ko.po380
-rw-r--r--src/bin/initdb/po/pl.po516
-rw-r--r--src/bin/initdb/po/pt_BR.po345
-rw-r--r--src/bin/initdb/po/ru.po441
-rw-r--r--src/bin/initdb/t/001_initdb.pl13
-rw-r--r--src/bin/pg_archivecleanup/nls.mk4
-rw-r--r--src/bin/pg_archivecleanup/pg_archivecleanup.c79
-rw-r--r--src/bin/pg_archivecleanup/po/pl.po178
-rw-r--r--src/bin/pg_archivecleanup/po/ru.po185
-rw-r--r--src/bin/pg_basebackup/.gitignore2
-rw-r--r--src/bin/pg_basebackup/Makefile27
-rw-r--r--src/bin/pg_basebackup/nls.mk3
-rw-r--r--src/bin/pg_basebackup/pg_basebackup.c488
-rw-r--r--src/bin/pg_basebackup/pg_receivewal.c (renamed from src/bin/pg_basebackup/pg_receivexlog.c)138
-rw-r--r--src/bin/pg_basebackup/pg_recvlogical.c169
-rw-r--r--src/bin/pg_basebackup/po/de.po743
-rw-r--r--src/bin/pg_basebackup/po/pl.po799
-rw-r--r--src/bin/pg_basebackup/po/ru.po745
-rw-r--r--src/bin/pg_basebackup/receivelog.c482
-rw-r--r--src/bin/pg_basebackup/receivelog.h14
-rw-r--r--src/bin/pg_basebackup/streamutil.c58
-rw-r--r--src/bin/pg_basebackup/streamutil.h10
-rw-r--r--src/bin/pg_basebackup/t/010_pg_basebackup.pl117
-rw-r--r--src/bin/pg_basebackup/t/020_pg_receivewal.pl8
-rw-r--r--src/bin/pg_basebackup/t/020_pg_receivexlog.pl8
-rw-r--r--src/bin/pg_basebackup/t/030_pg_recvlogical.pl56
-rw-r--r--src/bin/pg_basebackup/walmethods.c986
-rw-r--r--src/bin/pg_basebackup/walmethods.h94
-rw-r--r--src/bin/pg_config/Makefile2
-rw-r--r--src/bin/pg_config/pg_config.c2
-rw-r--r--src/bin/pg_config/po/fr.po78
-rw-r--r--src/bin/pg_config/po/ko.po263
-rw-r--r--src/bin/pg_config/po/pt_BR.po87
-rw-r--r--src/bin/pg_config/po/ru.po41
-rw-r--r--src/bin/pg_controldata/Makefile2
-rw-r--r--src/bin/pg_controldata/pg_controldata.c22
-rw-r--r--src/bin/pg_controldata/po/de.po178
-rw-r--r--src/bin/pg_controldata/po/fr.po136
-rw-r--r--src/bin/pg_controldata/po/ko.po256
-rw-r--r--src/bin/pg_controldata/po/pl.po189
-rw-r--r--src/bin/pg_controldata/po/pt_BR.po246
-rw-r--r--src/bin/pg_controldata/po/ru.po251
-rw-r--r--src/bin/pg_ctl/Makefile2
-rw-r--r--src/bin/pg_ctl/pg_ctl.c150
-rw-r--r--src/bin/pg_ctl/po/fr.po306
-rw-r--r--src/bin/pg_ctl/po/ko.po293
-rw-r--r--src/bin/pg_ctl/po/pl.po378
-rw-r--r--src/bin/pg_ctl/po/ru.po372
-rw-r--r--src/bin/pg_ctl/t/001_start_stop.pl31
-rw-r--r--src/bin/pg_ctl/t/002_status.pl2
-rw-r--r--src/bin/pg_ctl/t/003_promote.pl61
-rw-r--r--src/bin/pg_dump/Makefile2
-rw-r--r--src/bin/pg_dump/common.c18
-rw-r--r--src/bin/pg_dump/compress_io.c2
-rw-r--r--src/bin/pg_dump/compress_io.h2
-rw-r--r--src/bin/pg_dump/dumputils.c84
-rw-r--r--src/bin/pg_dump/dumputils.h6
-rw-r--r--src/bin/pg_dump/parallel.c506
-rw-r--r--src/bin/pg_dump/parallel.h71
-rw-r--r--src/bin/pg_dump/pg_backup.h16
-rw-r--r--src/bin/pg_dump/pg_backup_archiver.c369
-rw-r--r--src/bin/pg_dump/pg_backup_archiver.h199
-rw-r--r--src/bin/pg_dump/pg_backup_custom.c86
-rw-r--r--src/bin/pg_dump/pg_backup_db.c56
-rw-r--r--src/bin/pg_dump/pg_backup_directory.c129
-rw-r--r--src/bin/pg_dump/pg_backup_tar.c8
-rw-r--r--src/bin/pg_dump/pg_backup_utils.c2
-rw-r--r--src/bin/pg_dump/pg_backup_utils.h2
-rw-r--r--src/bin/pg_dump/pg_dump.c3585
-rw-r--r--src/bin/pg_dump/pg_dump.h74
-rw-r--r--src/bin/pg_dump/pg_dump_sort.c182
-rw-r--r--src/bin/pg_dump/pg_dumpall.c384
-rw-r--r--src/bin/pg_dump/pg_restore.c57
-rw-r--r--src/bin/pg_dump/po/de.po1080
-rw-r--r--src/bin/pg_dump/po/fr.po2452
-rw-r--r--src/bin/pg_dump/po/ko.po1181
-rw-r--r--src/bin/pg_dump/po/pl.po1202
-rw-r--r--src/bin/pg_dump/po/pt_BR.po1043
-rw-r--r--src/bin/pg_dump/po/ru.po1248
-rw-r--r--src/bin/pg_dump/t/001_basic.pl126
-rw-r--r--src/bin/pg_dump/t/002_pg_dump.pl4730
-rw-r--r--src/bin/pg_dump/t/010_dump_connstr.pl175
-rw-r--r--src/bin/pg_resetwal/.gitignore1
-rw-r--r--src/bin/pg_resetwal/Makefile35
-rw-r--r--src/bin/pg_resetwal/nls.mk4
-rw-r--r--src/bin/pg_resetwal/pg_resetwal.c (renamed from src/bin/pg_resetxlog/pg_resetxlog.c)119
-rw-r--r--src/bin/pg_resetwal/po/cs.po (renamed from src/bin/pg_resetxlog/po/cs.po)0
-rw-r--r--src/bin/pg_resetwal/po/de.po (renamed from src/bin/pg_resetxlog/po/de.po)0
-rw-r--r--src/bin/pg_resetwal/po/es.po (renamed from src/bin/pg_resetxlog/po/es.po)0
-rw-r--r--src/bin/pg_resetwal/po/fr.po (renamed from src/bin/pg_resetxlog/po/fr.po)164
-rw-r--r--src/bin/pg_resetwal/po/it.po (renamed from src/bin/pg_resetxlog/po/it.po)0
-rw-r--r--src/bin/pg_resetwal/po/ja.po (renamed from src/bin/pg_resetxlog/po/ja.po)414
-rw-r--r--src/bin/pg_resetwal/po/ko.po (renamed from src/bin/pg_resetxlog/po/ko.po)15
-rw-r--r--src/bin/pg_resetwal/po/pl.po (renamed from src/bin/pg_resetxlog/po/pl.po)0
-rw-r--r--src/bin/pg_resetwal/po/pt_BR.po (renamed from src/bin/pg_resetxlog/po/pt_BR.po)8
-rw-r--r--src/bin/pg_resetwal/po/ru.po (renamed from src/bin/pg_resetxlog/po/ru.po)276
-rw-r--r--src/bin/pg_resetwal/po/sv.po (renamed from src/bin/pg_resetxlog/po/sv.po)0
-rw-r--r--src/bin/pg_resetwal/po/zh_CN.po (renamed from src/bin/pg_resetxlog/po/zh_CN.po)0
-rw-r--r--src/bin/pg_resetxlog/.gitignore1
-rw-r--r--src/bin/pg_resetxlog/Makefile35
-rw-r--r--src/bin/pg_resetxlog/nls.mk4
-rw-r--r--src/bin/pg_rewind/Makefile2
-rw-r--r--src/bin/pg_rewind/RewindTest.pm10
-rw-r--r--src/bin/pg_rewind/copy_fetch.c18
-rw-r--r--src/bin/pg_rewind/datapagemap.c2
-rw-r--r--src/bin/pg_rewind/datapagemap.h2
-rw-r--r--src/bin/pg_rewind/fetch.c3
-rw-r--r--src/bin/pg_rewind/fetch.h4
-rw-r--r--src/bin/pg_rewind/file_ops.c3
-rw-r--r--src/bin/pg_rewind/file_ops.h2
-rw-r--r--src/bin/pg_rewind/filemap.c11
-rw-r--r--src/bin/pg_rewind/filemap.h2
-rw-r--r--src/bin/pg_rewind/libpq_fetch.c23
-rw-r--r--src/bin/pg_rewind/logging.c12
-rw-r--r--src/bin/pg_rewind/logging.h2
-rw-r--r--src/bin/pg_rewind/nls.mk2
-rw-r--r--src/bin/pg_rewind/parsexlog.c8
-rw-r--r--src/bin/pg_rewind/pg_rewind.c15
-rw-r--r--src/bin/pg_rewind/pg_rewind.h4
-rw-r--r--src/bin/pg_rewind/po/de.po245
-rw-r--r--src/bin/pg_rewind/po/fr.po163
-rw-r--r--src/bin/pg_rewind/po/ja.po850
-rw-r--r--src/bin/pg_rewind/po/ko.po248
-rw-r--r--src/bin/pg_rewind/po/pl.po304
-rw-r--r--src/bin/pg_rewind/po/pt_BR.po859
-rw-r--r--src/bin/pg_rewind/po/ru.po252
-rw-r--r--src/bin/pg_rewind/t/004_pg_xlog_symlink.pl10
-rw-r--r--src/bin/pg_rewind/timeline.c4
-rw-r--r--src/bin/pg_test_fsync/nls.mk5
-rw-r--r--src/bin/pg_test_fsync/pg_test_fsync.c72
-rw-r--r--src/bin/pg_test_fsync/po/pl.po189
-rw-r--r--src/bin/pg_test_fsync/po/ru.po196
-rw-r--r--src/bin/pg_test_timing/nls.mk4
-rw-r--r--src/bin/pg_test_timing/pg_test_timing.c28
-rw-r--r--src/bin/pg_test_timing/po/de.po79
-rw-r--r--src/bin/pg_test_timing/po/pl.po76
-rw-r--r--src/bin/pg_test_timing/po/ru.po81
-rw-r--r--src/bin/pg_upgrade/IMPLEMENTATION4
-rw-r--r--src/bin/pg_upgrade/check.c82
-rw-r--r--src/bin/pg_upgrade/controldata.c28
-rw-r--r--src/bin/pg_upgrade/dump.c3
-rw-r--r--src/bin/pg_upgrade/exec.c148
-rw-r--r--src/bin/pg_upgrade/file.c271
-rw-r--r--src/bin/pg_upgrade/function.c105
-rw-r--r--src/bin/pg_upgrade/info.c14
-rw-r--r--src/bin/pg_upgrade/nls.mk12
-rw-r--r--src/bin/pg_upgrade/option.c118
-rw-r--r--src/bin/pg_upgrade/parallel.c11
-rw-r--r--src/bin/pg_upgrade/pg_upgrade.c60
-rw-r--r--src/bin/pg_upgrade/pg_upgrade.h18
-rw-r--r--src/bin/pg_upgrade/po/ru.po1729
-rw-r--r--src/bin/pg_upgrade/relfilenode.c50
-rw-r--r--src/bin/pg_upgrade/server.c10
-rw-r--r--src/bin/pg_upgrade/tablespace.c8
-rw-r--r--src/bin/pg_upgrade/test.sh8
-rw-r--r--src/bin/pg_upgrade/util.c31
-rw-r--r--src/bin/pg_upgrade/version.c222
-rw-r--r--src/bin/pg_waldump/.gitignore (renamed from src/bin/pg_xlogdump/.gitignore)2
-rw-r--r--src/bin/pg_waldump/Makefile (renamed from src/bin/pg_xlogdump/Makefile)18
-rw-r--r--src/bin/pg_waldump/compat.c (renamed from src/bin/pg_xlogdump/compat.c)13
-rw-r--r--src/bin/pg_waldump/nls.mk6
-rw-r--r--src/bin/pg_waldump/pg_waldump.c (renamed from src/bin/pg_xlogdump/pg_xlogdump.c)128
-rw-r--r--src/bin/pg_waldump/rmgrdesc.c (renamed from src/bin/pg_xlogdump/rmgrdesc.c)16
-rw-r--r--src/bin/pg_waldump/rmgrdesc.h (renamed from src/bin/pg_xlogdump/rmgrdesc.h)4
-rw-r--r--src/bin/pgbench/exprparse.y2
-rw-r--r--src/bin/pgbench/exprscan.l23
-rw-r--r--src/bin/pgbench/pgbench.c1279
-rw-r--r--src/bin/pgbench/pgbench.h2
-rw-r--r--src/bin/pgevent/Makefile2
-rw-r--r--src/bin/pgevent/pgevent.c6
-rw-r--r--src/bin/psql/Makefile16
-rw-r--r--src/bin/psql/command.c1792
-rw-r--r--src/bin/psql/command.h7
-rw-r--r--src/bin/psql/common.c183
-rw-r--r--src/bin/psql/common.h8
-rw-r--r--src/bin/psql/conditional.c153
-rw-r--r--src/bin/psql/conditional.h83
-rw-r--r--src/bin/psql/copy.c18
-rw-r--r--src/bin/psql/copy.h2
-rw-r--r--src/bin/psql/create_help.pl30
-rw-r--r--src/bin/psql/crosstabview.c52
-rw-r--r--src/bin/psql/crosstabview.h2
-rw-r--r--src/bin/psql/describe.c1161
-rw-r--r--src/bin/psql/describe.h11
-rw-r--r--src/bin/psql/help.c25
-rw-r--r--src/bin/psql/help.h2
-rw-r--r--src/bin/psql/input.c9
-rw-r--r--src/bin/psql/input.h2
-rw-r--r--src/bin/psql/large_obj.c2
-rw-r--r--src/bin/psql/large_obj.h2
-rw-r--r--src/bin/psql/mainloop.c123
-rw-r--r--src/bin/psql/mainloop.h2
-rw-r--r--src/bin/psql/nls.mk2
-rw-r--r--src/bin/psql/po/de.po2846
-rw-r--r--src/bin/psql/po/fr.po7055
-rw-r--r--src/bin/psql/po/he.po5800
-rw-r--r--src/bin/psql/po/ko.po5447
-rw-r--r--src/bin/psql/po/pl.po3228
-rw-r--r--src/bin/psql/po/pt_BR.po3461
-rw-r--r--src/bin/psql/po/ru.po3274
-rw-r--r--src/bin/psql/prompt.c8
-rw-r--r--src/bin/psql/prompt.h5
-rw-r--r--src/bin/psql/psqlscanslash.h9
-rw-r--r--src/bin/psql/psqlscanslash.l97
-rw-r--r--src/bin/psql/settings.h5
-rw-r--r--src/bin/psql/startup.c391
-rw-r--r--src/bin/psql/stringutils.c2
-rw-r--r--src/bin/psql/stringutils.h2
-rw-r--r--src/bin/psql/tab-complete.c844
-rw-r--r--src/bin/psql/tab-complete.h2
-rw-r--r--src/bin/psql/variables.c320
-rw-r--r--src/bin/psql/variables.h82
-rw-r--r--src/bin/scripts/.gitignore2
-rw-r--r--src/bin/scripts/Makefile10
-rw-r--r--src/bin/scripts/clusterdb.c2
-rw-r--r--src/bin/scripts/common.c36
-rw-r--r--src/bin/scripts/common.h2
-rw-r--r--src/bin/scripts/createdb.c2
-rw-r--r--src/bin/scripts/createlang.c251
-rw-r--r--src/bin/scripts/createuser.c66
-rw-r--r--src/bin/scripts/dropdb.c2
-rw-r--r--src/bin/scripts/droplang.c252
-rw-r--r--src/bin/scripts/dropuser.c9
-rw-r--r--src/bin/scripts/nls.mk6
-rw-r--r--src/bin/scripts/pg_isready.c2
-rw-r--r--src/bin/scripts/po/de.po384
-rw-r--r--src/bin/scripts/po/fr.po390
-rw-r--r--src/bin/scripts/po/he.po1079
-rw-r--r--src/bin/scripts/po/ja.po517
-rw-r--r--src/bin/scripts/po/ko.po361
-rw-r--r--src/bin/scripts/po/pt_BR.po181
-rw-r--r--src/bin/scripts/po/ru.po469
-rw-r--r--src/bin/scripts/reindexdb.c2
-rw-r--r--src/bin/scripts/t/010_clusterdb.pl5
-rw-r--r--src/bin/scripts/t/030_createlang.pl25
-rw-r--r--src/bin/scripts/t/060_droplang.pl23
-rw-r--r--src/bin/scripts/t/090_reindexdb.pl11
-rw-r--r--src/bin/scripts/t/100_vacuumdb.pl4
-rw-r--r--src/bin/scripts/t/200_connstr.pl41
-rw-r--r--src/bin/scripts/vacuumdb.c18
-rw-r--r--src/common/Makefile21
-rw-r--r--src/common/base64.c199
-rw-r--r--src/common/config_info.c2
-rw-r--r--src/common/controldata_utils.c24
-rw-r--r--src/common/exec.c13
-rw-r--r--src/common/fe_memutils.c2
-rw-r--r--src/common/file_utils.c403
-rw-r--r--src/common/ip.c259
-rw-r--r--src/common/keywords.c2
-rw-r--r--src/common/md5.c (renamed from src/backend/libpq/md5.c)13
-rw-r--r--src/common/pg_lzcompress.c2
-rw-r--r--src/common/pgfnames.c2
-rw-r--r--src/common/psprintf.c2
-rw-r--r--src/common/relpath.c2
-rw-r--r--src/common/restricted_token.c2
-rw-r--r--src/common/rmtree.c2
-rw-r--r--src/common/saslprep.c1279
-rw-r--r--src/common/scram-common.c247
-rw-r--r--src/common/sha2.c1006
-rw-r--r--src/common/sha2_openssl.c102
-rw-r--r--src/common/string.c2
-rw-r--r--src/common/unicode/.gitignore7
-rw-r--r--src/common/unicode/Makefile53
-rw-r--r--src/common/unicode/README35
-rw-r--r--src/common/unicode/generate-norm_test_table.pl102
-rw-r--r--src/common/unicode/generate-unicode_norm_table.pl231
-rw-r--r--src/common/unicode/norm_test.c80
-rw-r--r--src/common/unicode_norm.c437
-rw-r--r--src/common/username.c4
-rw-r--r--src/common/wait_error.c4
-rw-r--r--src/fe_utils/Makefile8
-rw-r--r--src/fe_utils/mbprint.c2
-rw-r--r--src/fe_utils/print.c11
-rw-r--r--src/fe_utils/psqlscan.l30
-rw-r--r--src/fe_utils/simple_list.c2
-rw-r--r--src/fe_utils/string_utils.c88
-rw-r--r--src/include/Makefile3
-rw-r--r--src/include/access/amapi.h29
-rw-r--r--src/include/access/amvalidate.h2
-rw-r--r--src/include/access/attnum.h2
-rw-r--r--src/include/access/brin.h27
-rw-r--r--src/include/access/brin_internal.h5
-rw-r--r--src/include/access/brin_page.h2
-rw-r--r--src/include/access/brin_pageops.h2
-rw-r--r--src/include/access/brin_revmap.h3
-rw-r--r--src/include/access/brin_tuple.h14
-rw-r--r--src/include/access/brin_xlog.h23
-rw-r--r--src/include/access/bufmask.h32
-rw-r--r--src/include/access/clog.h10
-rw-r--r--src/include/access/commit_ts.h10
-rw-r--r--src/include/access/genam.h17
-rw-r--r--src/include/access/generic_xlog.h3
-rw-r--r--src/include/access/gin.h9
-rw-r--r--src/include/access/gin_private.h527
-rw-r--r--src/include/access/ginblock.h336
-rw-r--r--src/include/access/ginxlog.h217
-rw-r--r--src/include/access/gist.h2
-rw-r--r--src/include/access/gist_private.h82
-rw-r--r--src/include/access/gistscan.h2
-rw-r--r--src/include/access/gistxlog.h69
-rw-r--r--src/include/access/hash.h217
-rw-r--r--src/include/access/hash_xlog.h282
-rw-r--r--src/include/access/heapam.h6
-rw-r--r--src/include/access/heapam_xlog.h3
-rw-r--r--src/include/access/hio.h2
-rw-r--r--src/include/access/htup.h2
-rw-r--r--src/include/access/htup_details.h22
-rw-r--r--src/include/access/itup.h4
-rw-r--r--src/include/access/multixact.h5
-rw-r--r--src/include/access/nbtree.h260
-rw-r--r--src/include/access/nbtxlog.h255
-rw-r--r--src/include/access/parallel.h8
-rw-r--r--src/include/access/printsimple.h23
-rw-r--r--src/include/access/printtup.h2
-rw-r--r--src/include/access/reloptions.h5
-rw-r--r--src/include/access/relscan.h25
-rw-r--r--src/include/access/rewriteheap.h2
-rw-r--r--src/include/access/rmgr.h2
-rw-r--r--src/include/access/rmgrlist.h48
-rw-r--r--src/include/access/sdir.h2
-rw-r--r--src/include/access/skey.h2
-rw-r--r--src/include/access/slru.h7
-rw-r--r--src/include/access/spgist.h25
-rw-r--r--src/include/access/spgist_private.h234
-rw-r--r--src/include/access/spgxlog.h257
-rw-r--r--src/include/access/stratnum.h2
-rw-r--r--src/include/access/subtrans.h4
-rw-r--r--src/include/access/sysattr.h2
-rw-r--r--src/include/access/timeline.h4
-rw-r--r--src/include/access/transam.h9
-rw-r--r--src/include/access/tsmapi.h2
-rw-r--r--src/include/access/tupconvert.h6
-rw-r--r--src/include/access/tupdesc.h9
-rw-r--r--src/include/access/tupmacs.h2
-rw-r--r--src/include/access/tuptoaster.h15
-rw-r--r--src/include/access/twophase.h11
-rw-r--r--src/include/access/twophase_rmgr.h2
-rw-r--r--src/include/access/valid.h2
-rw-r--r--src/include/access/visibilitymap.h4
-rw-r--r--src/include/access/xact.h32
-rw-r--r--src/include/access/xlog.h45
-rw-r--r--src/include/access/xlog_fn.h37
-rw-r--r--src/include/access/xlog_internal.h20
-rw-r--r--src/include/access/xlogdefs.h4
-rw-r--r--src/include/access/xloginsert.h4
-rw-r--r--src/include/access/xlogreader.h25
-rw-r--r--src/include/access/xlogrecord.h16
-rw-r--r--src/include/access/xlogutils.h5
-rw-r--r--src/include/bootstrap/bootstrap.h2
-rw-r--r--src/include/c.h50
-rw-r--r--src/include/catalog/binary_upgrade.h2
-rw-r--r--src/include/catalog/catalog.h2
-rw-r--r--src/include/catalog/catversion.h4
-rw-r--r--src/include/catalog/dependency.h31
-rw-r--r--src/include/catalog/genbki.h2
-rw-r--r--src/include/catalog/heap.h13
-rw-r--r--src/include/catalog/index.h2
-rw-r--r--src/include/catalog/indexing.h48
-rw-r--r--src/include/catalog/namespace.h6
-rw-r--r--src/include/catalog/objectaccess.h2
-rw-r--r--src/include/catalog/objectaddress.h10
-rw-r--r--src/include/catalog/opfam_internal.h2
-rw-r--r--src/include/catalog/partition.h101
-rw-r--r--src/include/catalog/pg_aggregate.h2
-rw-r--r--src/include/catalog/pg_am.h2
-rw-r--r--src/include/catalog/pg_amop.h40
-rw-r--r--src/include/catalog/pg_amproc.h170
-rw-r--r--src/include/catalog/pg_attrdef.h2
-rw-r--r--src/include/catalog/pg_attribute.h26
-rw-r--r--src/include/catalog/pg_auth_members.h2
-rw-r--r--src/include/catalog/pg_authid.h18
-rw-r--r--src/include/catalog/pg_cast.h18
-rw-r--r--src/include/catalog/pg_class.h34
-rw-r--r--src/include/catalog/pg_collation.h28
-rw-r--r--src/include/catalog/pg_collation_fn.h7
-rw-r--r--src/include/catalog/pg_constraint.h2
-rw-r--r--src/include/catalog/pg_constraint_fn.h2
-rw-r--r--src/include/catalog/pg_control.h16
-rw-r--r--src/include/catalog/pg_conversion.h2
-rw-r--r--src/include/catalog/pg_conversion_fn.h2
-rw-r--r--src/include/catalog/pg_database.h2
-rw-r--r--src/include/catalog/pg_db_role_setting.h2
-rw-r--r--src/include/catalog/pg_default_acl.h3
-rw-r--r--src/include/catalog/pg_depend.h2
-rw-r--r--src/include/catalog/pg_description.h2
-rw-r--r--src/include/catalog/pg_enum.h4
-rw-r--r--src/include/catalog/pg_event_trigger.h2
-rw-r--r--src/include/catalog/pg_extension.h2
-rw-r--r--src/include/catalog/pg_foreign_data_wrapper.h2
-rw-r--r--src/include/catalog/pg_foreign_server.h2
-rw-r--r--src/include/catalog/pg_foreign_table.h4
-rw-r--r--src/include/catalog/pg_index.h2
-rw-r--r--src/include/catalog/pg_inherits.h2
-rw-r--r--src/include/catalog/pg_inherits_fn.h2
-rw-r--r--src/include/catalog/pg_init_privs.h2
-rw-r--r--src/include/catalog/pg_language.h2
-rw-r--r--src/include/catalog/pg_largeobject.h2
-rw-r--r--src/include/catalog/pg_largeobject_metadata.h2
-rw-r--r--src/include/catalog/pg_namespace.h2
-rw-r--r--src/include/catalog/pg_opclass.h38
-rw-r--r--src/include/catalog/pg_operator.h35
-rw-r--r--src/include/catalog/pg_operator_fn.h2
-rw-r--r--src/include/catalog/pg_opfamily.h7
-rw-r--r--src/include/catalog/pg_partitioned_table.h74
-rw-r--r--src/include/catalog/pg_pltemplate.h2
-rw-r--r--src/include/catalog/pg_policy.h18
-rw-r--r--src/include/catalog/pg_proc.h317
-rw-r--r--src/include/catalog/pg_proc_fn.h2
-rw-r--r--src/include/catalog/pg_publication.h104
-rw-r--r--src/include/catalog/pg_publication_rel.h52
-rw-r--r--src/include/catalog/pg_range.h2
-rw-r--r--src/include/catalog/pg_replication_origin.h2
-rw-r--r--src/include/catalog/pg_rewrite.h2
-rw-r--r--src/include/catalog/pg_seclabel.h2
-rw-r--r--src/include/catalog/pg_sequence.h32
-rw-r--r--src/include/catalog/pg_shdepend.h2
-rw-r--r--src/include/catalog/pg_shdescription.h2
-rw-r--r--src/include/catalog/pg_shseclabel.h2
-rw-r--r--src/include/catalog/pg_statistic.h4
-rw-r--r--src/include/catalog/pg_statistic_ext.h80
-rw-r--r--src/include/catalog/pg_subscription.h96
-rw-r--r--src/include/catalog/pg_subscription_rel.h82
-rw-r--r--src/include/catalog/pg_tablespace.h2
-rw-r--r--src/include/catalog/pg_transform.h2
-rw-r--r--src/include/catalog/pg_trigger.h15
-rw-r--r--src/include/catalog/pg_ts_config.h2
-rw-r--r--src/include/catalog/pg_ts_config_map.h2
-rw-r--r--src/include/catalog/pg_ts_dict.h2
-rw-r--r--src/include/catalog/pg_ts_parser.h2
-rw-r--r--src/include/catalog/pg_ts_template.h2
-rw-r--r--src/include/catalog/pg_type.h16
-rw-r--r--src/include/catalog/pg_type_fn.h2
-rw-r--r--src/include/catalog/pg_user_mapping.h2
-rw-r--r--src/include/catalog/storage.h2
-rw-r--r--src/include/catalog/storage_xlog.h2
-rw-r--r--src/include/catalog/toasting.h3
-rw-r--r--src/include/commands/alter.h2
-rw-r--r--src/include/commands/async.h7
-rw-r--r--src/include/commands/cluster.h2
-rw-r--r--src/include/commands/collationcmds.h5
-rw-r--r--src/include/commands/comment.h2
-rw-r--r--src/include/commands/conversioncmds.h2
-rw-r--r--src/include/commands/copy.h15
-rw-r--r--src/include/commands/createas.h5
-rw-r--r--src/include/commands/dbcommands.h6
-rw-r--r--src/include/commands/dbcommands_xlog.h2
-rw-r--r--src/include/commands/defrem.h25
-rw-r--r--src/include/commands/discard.h2
-rw-r--r--src/include/commands/event_trigger.h2
-rw-r--r--src/include/commands/explain.h15
-rw-r--r--src/include/commands/extension.h19
-rw-r--r--src/include/commands/lockcmds.h2
-rw-r--r--src/include/commands/matview.h2
-rw-r--r--src/include/commands/policy.h2
-rw-r--r--src/include/commands/portalcmds.h4
-rw-r--r--src/include/commands/prepare.h9
-rw-r--r--src/include/commands/progress.h2
-rw-r--r--src/include/commands/publicationcmds.h29
-rw-r--r--src/include/commands/schemacmds.h12
-rw-r--r--src/include/commands/seclabel.h2
-rw-r--r--src/include/commands/sequence.h46
-rw-r--r--src/include/commands/subscriptioncmds.h29
-rw-r--r--src/include/commands/tablecmds.h4
-rw-r--r--src/include/commands/tablespace.h2
-rw-r--r--src/include/commands/trigger.h6
-rw-r--r--src/include/commands/typecmds.h6
-rw-r--r--src/include/commands/user.h11
-rw-r--r--src/include/commands/vacuum.h2
-rw-r--r--src/include/commands/variable.h2
-rw-r--r--src/include/commands/view.h5
-rw-r--r--src/include/common/base64.h19
-rw-r--r--src/include/common/config_info.h2
-rw-r--r--src/include/common/controldata_utils.h4
-rw-r--r--src/include/common/fe_memutils.h2
-rw-r--r--src/include/common/file_utils.h27
-rw-r--r--src/include/common/int128.h276
-rw-r--r--src/include/common/ip.h (renamed from src/include/libpq/ip.h)20
-rw-r--r--src/include/common/keywords.h2
-rw-r--r--src/include/common/md5.h (renamed from src/include/libpq/md5.h)8
-rw-r--r--src/include/common/relpath.h2
-rw-r--r--src/include/common/restricted_token.h2
-rw-r--r--src/include/common/saslprep.h30
-rw-r--r--src/include/common/scram-common.h59
-rw-r--r--src/include/common/sha2.h115
-rw-r--r--src/include/common/string.h2
-rw-r--r--src/include/common/unicode_norm.h21
-rw-r--r--src/include/common/unicode_norm_table.h8859
-rw-r--r--src/include/common/username.h2
-rw-r--r--src/include/datatype/timestamp.h57
-rw-r--r--src/include/executor/execExpr.h650
-rw-r--r--src/include/executor/execParallel.h6
-rw-r--r--src/include/executor/execdebug.h25
-rw-r--r--src/include/executor/execdesc.h15
-rw-r--r--src/include/executor/executor.h229
-rw-r--r--src/include/executor/functions.h2
-rw-r--r--src/include/executor/hashjoin.h2
-rw-r--r--src/include/executor/instrument.h2
-rw-r--r--src/include/executor/nodeAgg.h2
-rw-r--r--src/include/executor/nodeAppend.h2
-rw-r--r--src/include/executor/nodeBitmapAnd.h2
-rw-r--r--src/include/executor/nodeBitmapHeapscan.h9
-rw-r--r--src/include/executor/nodeBitmapIndexscan.h2
-rw-r--r--src/include/executor/nodeBitmapOr.h2
-rw-r--r--src/include/executor/nodeCtescan.h2
-rw-r--r--src/include/executor/nodeCustom.h3
-rw-r--r--src/include/executor/nodeForeignscan.h3
-rw-r--r--src/include/executor/nodeFunctionscan.h2
-rw-r--r--src/include/executor/nodeGather.h2
-rw-r--r--src/include/executor/nodeGatherMerge.h27
-rw-r--r--src/include/executor/nodeGroup.h2
-rw-r--r--src/include/executor/nodeHash.h2
-rw-r--r--src/include/executor/nodeHashjoin.h2
-rw-r--r--src/include/executor/nodeIndexonlyscan.h11
-rw-r--r--src/include/executor/nodeIndexscan.h6
-rw-r--r--src/include/executor/nodeLimit.h2
-rw-r--r--src/include/executor/nodeLockRows.h2
-rw-r--r--src/include/executor/nodeMaterial.h2
-rw-r--r--src/include/executor/nodeMergeAppend.h2
-rw-r--r--src/include/executor/nodeMergejoin.h2
-rw-r--r--src/include/executor/nodeModifyTable.h2
-rw-r--r--src/include/executor/nodeNamedtuplestorescan.h24
-rw-r--r--src/include/executor/nodeNestloop.h2
-rw-r--r--src/include/executor/nodeProjectSet.h24
-rw-r--r--src/include/executor/nodeRecursiveunion.h2
-rw-r--r--src/include/executor/nodeResult.h2
-rw-r--r--src/include/executor/nodeSamplescan.h2
-rw-r--r--src/include/executor/nodeSeqscan.h2
-rw-r--r--src/include/executor/nodeSetOp.h2
-rw-r--r--src/include/executor/nodeSort.h2
-rw-r--r--src/include/executor/nodeSubplan.h6
-rw-r--r--src/include/executor/nodeSubqueryscan.h2
-rw-r--r--src/include/executor/nodeTableFuncscan.h24
-rw-r--r--src/include/executor/nodeTidscan.h2
-rw-r--r--src/include/executor/nodeUnique.h2
-rw-r--r--src/include/executor/nodeValuesscan.h2
-rw-r--r--src/include/executor/nodeWindowAgg.h2
-rw-r--r--src/include/executor/nodeWorktablescan.h2
-rw-r--r--src/include/executor/spi.h24
-rw-r--r--src/include/executor/spi_priv.h4
-rw-r--r--src/include/executor/tablefunc.h67
-rw-r--r--src/include/executor/tqueue.h2
-rw-r--r--src/include/executor/tstoreReceiver.h2
-rw-r--r--src/include/executor/tuptable.h2
-rw-r--r--src/include/fe_utils/mbprint.h2
-rw-r--r--src/include/fe_utils/print.h2
-rw-r--r--src/include/fe_utils/psqlscan.h18
-rw-r--r--src/include/fe_utils/psqlscan_int.h8
-rw-r--r--src/include/fe_utils/simple_list.h2
-rw-r--r--src/include/fe_utils/string_utils.h8
-rw-r--r--src/include/fmgr.h84
-rw-r--r--src/include/foreign/fdwapi.h4
-rw-r--r--src/include/foreign/foreign.h4
-rw-r--r--src/include/funcapi.h2
-rw-r--r--src/include/getaddrinfo.h6
-rw-r--r--src/include/getopt_long.h2
-rw-r--r--src/include/lib/binaryheap.h2
-rw-r--r--src/include/lib/bipartite_match.h2
-rw-r--r--src/include/lib/hyperloglog.h2
-rw-r--r--src/include/lib/ilist.h2
-rw-r--r--src/include/lib/knapsack.h17
-rw-r--r--src/include/lib/pairingheap.h2
-rw-r--r--src/include/lib/rbtree.h24
-rw-r--r--src/include/lib/simplehash.h956
-rw-r--r--src/include/lib/stringinfo.h2
-rw-r--r--src/include/libpq/auth.h2
-rw-r--r--src/include/libpq/be-fsstubs.h33
-rw-r--r--src/include/libpq/crypt.h33
-rw-r--r--src/include/libpq/hba.h22
-rw-r--r--src/include/libpq/ifaddr.h30
-rw-r--r--src/include/libpq/libpq-be.h15
-rw-r--r--src/include/libpq/libpq-fs.h2
-rw-r--r--src/include/libpq/libpq.h8
-rw-r--r--src/include/libpq/pqcomm.h7
-rw-r--r--src/include/libpq/pqformat.h2
-rw-r--r--src/include/libpq/pqmq.h2
-rw-r--r--src/include/libpq/pqsignal.h2
-rw-r--r--src/include/libpq/scram.h34
-rw-r--r--src/include/mb/pg_wchar.h92
-rw-r--r--src/include/miscadmin.h9
-rw-r--r--src/include/nodes/bitmapset.h8
-rw-r--r--src/include/nodes/execnodes.h776
-rw-r--r--src/include/nodes/extensible.h3
-rw-r--r--src/include/nodes/lockoptions.h2
-rw-r--r--src/include/nodes/makefuncs.h6
-rw-r--r--src/include/nodes/memnodes.h4
-rw-r--r--src/include/nodes/nodeFuncs.h2
-rw-r--r--src/include/nodes/nodes.h136
-rw-r--r--src/include/nodes/params.h2
-rw-r--r--src/include/nodes/parsenodes.h447
-rw-r--r--src/include/nodes/pg_list.h23
-rw-r--r--src/include/nodes/plannodes.h118
-rw-r--r--src/include/nodes/primnodes.h90
-rw-r--r--src/include/nodes/print.h2
-rw-r--r--src/include/nodes/readfuncs.h2
-rw-r--r--src/include/nodes/relation.h199
-rw-r--r--src/include/nodes/replnodes.h14
-rw-r--r--src/include/nodes/tidbitmap.h12
-rw-r--r--src/include/nodes/value.h2
-rw-r--r--src/include/optimizer/clauses.h6
-rw-r--r--src/include/optimizer/cost.h38
-rw-r--r--src/include/optimizer/geqo.h2
-rw-r--r--src/include/optimizer/geqo_copy.h2
-rw-r--r--src/include/optimizer/geqo_gene.h2
-rw-r--r--src/include/optimizer/geqo_misc.h2
-rw-r--r--src/include/optimizer/geqo_mutation.h2
-rw-r--r--src/include/optimizer/geqo_pool.h2
-rw-r--r--src/include/optimizer/geqo_random.h2
-rw-r--r--src/include/optimizer/geqo_recombination.h2
-rw-r--r--src/include/optimizer/geqo_selection.h2
-rw-r--r--src/include/optimizer/joininfo.h2
-rw-r--r--src/include/optimizer/orclauses.h2
-rw-r--r--src/include/optimizer/pathnode.h48
-rw-r--r--src/include/optimizer/paths.h15
-rw-r--r--src/include/optimizer/placeholder.h2
-rw-r--r--src/include/optimizer/plancat.h2
-rw-r--r--src/include/optimizer/planmain.h10
-rw-r--r--src/include/optimizer/planner.h4
-rw-r--r--src/include/optimizer/predtest.h2
-rw-r--r--src/include/optimizer/prep.h7
-rw-r--r--src/include/optimizer/restrictinfo.h10
-rw-r--r--src/include/optimizer/subselect.h2
-rw-r--r--src/include/optimizer/tlist.h9
-rw-r--r--src/include/optimizer/var.h2
-rw-r--r--src/include/parser/analyze.h15
-rw-r--r--src/include/parser/gramparse.h2
-rw-r--r--src/include/parser/kwlist.h15
-rw-r--r--src/include/parser/parse_agg.h2
-rw-r--r--src/include/parser/parse_clause.h8
-rw-r--r--src/include/parser/parse_coerce.h6
-rw-r--r--src/include/parser/parse_collate.h2
-rw-r--r--src/include/parser/parse_cte.h2
-rw-r--r--src/include/parser/parse_enr.h22
-rw-r--r--src/include/parser/parse_expr.h2
-rw-r--r--src/include/parser/parse_func.h12
-rw-r--r--src/include/parser/parse_node.h58
-rw-r--r--src/include/parser/parse_oper.h7
-rw-r--r--src/include/parser/parse_param.h2
-rw-r--r--src/include/parser/parse_relation.h15
-rw-r--r--src/include/parser/parse_target.h5
-rw-r--r--src/include/parser/parse_type.h2
-rw-r--r--src/include/parser/parse_utilcmd.h4
-rw-r--r--src/include/parser/parser.h2
-rw-r--r--src/include/parser/parsetree.h2
-rw-r--r--src/include/parser/scanner.h4
-rw-r--r--src/include/parser/scansup.h2
-rw-r--r--src/include/pg_config.h.in68
-rw-r--r--src/include/pg_config.h.win3249
-rw-r--r--src/include/pg_config_manual.h25
-rw-r--r--src/include/pg_getopt.h2
-rw-r--r--src/include/pg_trace.h2
-rw-r--r--src/include/pgstat.h223
-rw-r--r--src/include/pgtar.h3
-rw-r--r--src/include/pgtime.h4
-rw-r--r--src/include/port.h22
-rw-r--r--src/include/port/atomics.h71
-rw-r--r--src/include/port/atomics/arch-arm.h9
-rw-r--r--src/include/port/atomics/arch-hppa.h2
-rw-r--r--src/include/port/atomics/arch-ia64.h5
-rw-r--r--src/include/port/atomics/arch-ppc.h5
-rw-r--r--src/include/port/atomics/arch-x86.h16
-rw-r--r--src/include/port/atomics/fallback.h38
-rw-r--r--src/include/port/atomics/generic-acc.h2
-rw-r--r--src/include/port/atomics/generic-gcc.h15
-rw-r--r--src/include/port/atomics/generic-msvc.h4
-rw-r--r--src/include/port/atomics/generic-sunpro.h2
-rw-r--r--src/include/port/atomics/generic-xlc.h2
-rw-r--r--src/include/port/atomics/generic.h58
-rw-r--r--src/include/port/darwin.h2
-rw-r--r--src/include/port/pg_bswap.h27
-rw-r--r--src/include/port/pg_crc32c.h2
-rw-r--r--src/include/port/sco.h7
-rw-r--r--src/include/port/unixware.h11
-rw-r--r--src/include/port/win32.h43
-rw-r--r--src/include/portability/instr_time.h104
-rw-r--r--src/include/portability/mem.h2
-rw-r--r--src/include/postgres.h89
-rw-r--r--src/include/postgres_ext.h5
-rw-r--r--src/include/postgres_fe.h2
-rw-r--r--src/include/postmaster/autovacuum.h16
-rw-r--r--src/include/postmaster/bgworker.h16
-rw-r--r--src/include/postmaster/bgworker_internals.h10
-rw-r--r--src/include/postmaster/bgwriter.h2
-rw-r--r--src/include/postmaster/fork_process.h2
-rw-r--r--src/include/postmaster/pgarch.h4
-rw-r--r--src/include/postmaster/postmaster.h2
-rw-r--r--src/include/postmaster/startup.h2
-rw-r--r--src/include/postmaster/syslogger.h9
-rw-r--r--src/include/postmaster/walwriter.h2
-rw-r--r--src/include/regex/regcustom.h23
-rw-r--r--src/include/regex/regex.h1
-rw-r--r--src/include/regex/regexport.h2
-rw-r--r--src/include/regex/regguts.h142
-rw-r--r--src/include/replication/basebackup.h2
-rw-r--r--src/include/replication/decode.h2
-rw-r--r--src/include/replication/logical.h23
-rw-r--r--src/include/replication/logicalfuncs.h9
-rw-r--r--src/include/replication/logicallauncher.h27
-rw-r--r--src/include/replication/logicalproto.h106
-rw-r--r--src/include/replication/logicalrelation.h42
-rw-r--r--src/include/replication/logicalworker.h17
-rw-r--r--src/include/replication/message.h2
-rw-r--r--src/include/replication/origin.h16
-rw-r--r--src/include/replication/output_plugin.h36
-rw-r--r--src/include/replication/pgoutput.h30
-rw-r--r--src/include/replication/reorderbuffer.h16
-rw-r--r--src/include/replication/slot.h13
-rw-r--r--src/include/replication/snapbuild.h31
-rw-r--r--src/include/replication/syncrep.h9
-rw-r--r--src/include/replication/walreceiver.h164
-rw-r--r--src/include/replication/walsender.h17
-rw-r--r--src/include/replication/walsender_private.h10
-rw-r--r--src/include/replication/worker_internal.h97
-rw-r--r--src/include/rewrite/prs2lock.h2
-rw-r--r--src/include/rewrite/rewriteDefine.h2
-rw-r--r--src/include/rewrite/rewriteHandler.h2
-rw-r--r--src/include/rewrite/rewriteManip.h2
-rw-r--r--src/include/rewrite/rewriteRemove.h2
-rw-r--r--src/include/rewrite/rewriteSupport.h4
-rw-r--r--src/include/rewrite/rowsecurity.h3
-rw-r--r--src/include/rusagestub.h2
-rw-r--r--src/include/snowball/header.h7
-rw-r--r--src/include/statistics/extended_stats_internal.h69
-rw-r--r--src/include/statistics/statistics.h98
-rw-r--r--src/include/storage/backendid.h2
-rw-r--r--src/include/storage/barrier.h23
-rw-r--r--src/include/storage/block.h2
-rw-r--r--src/include/storage/buf.h2
-rw-r--r--src/include/storage/buf_internals.h9
-rw-r--r--src/include/storage/buffile.h2
-rw-r--r--src/include/storage/bufmgr.h17
-rw-r--r--src/include/storage/bufpage.h16
-rw-r--r--src/include/storage/checksum.h2
-rw-r--r--src/include/storage/checksum_impl.h2
-rw-r--r--src/include/storage/condition_variable.h59
-rw-r--r--src/include/storage/copydir.h2
-rw-r--r--src/include/storage/dsm.h6
-rw-r--r--src/include/storage/dsm_impl.h6
-rw-r--r--src/include/storage/fd.h15
-rw-r--r--src/include/storage/freespace.h2
-rw-r--r--src/include/storage/fsm_internals.h2
-rw-r--r--src/include/storage/indexfsm.h2
-rw-r--r--src/include/storage/ipc.h2
-rw-r--r--src/include/storage/item.h2
-rw-r--r--src/include/storage/itemid.h2
-rw-r--r--src/include/storage/itemptr.h36
-rw-r--r--src/include/storage/large_object.h2
-rw-r--r--src/include/storage/latch.h14
-rw-r--r--src/include/storage/lmgr.h2
-rw-r--r--src/include/storage/lock.h2
-rw-r--r--src/include/storage/lockdefs.h2
-rw-r--r--src/include/storage/lwlock.h73
-rw-r--r--src/include/storage/off.h2
-rw-r--r--src/include/storage/pg_sema.h54
-rw-r--r--src/include/storage/pg_shmem.h2
-rw-r--r--src/include/storage/pmsignal.h2
-rw-r--r--src/include/storage/pos.h64
-rw-r--r--src/include/storage/predicate.h4
-rw-r--r--src/include/storage/predicate_internals.h4
-rw-r--r--src/include/storage/proc.h32
-rw-r--r--src/include/storage/procarray.h45
-rw-r--r--src/include/storage/proclist.h208
-rw-r--r--src/include/storage/proclist_types.h45
-rw-r--r--src/include/storage/procsignal.h2
-rw-r--r--src/include/storage/reinit.h2
-rw-r--r--src/include/storage/relfilenode.h2
-rw-r--r--src/include/storage/s_lock.h29
-rw-r--r--src/include/storage/shm_mq.h2
-rw-r--r--src/include/storage/shm_toc.h2
-rw-r--r--src/include/storage/shmem.h4
-rw-r--r--src/include/storage/sinval.h5
-rw-r--r--src/include/storage/sinvaladt.h2
-rw-r--r--src/include/storage/smgr.h16
-rw-r--r--src/include/storage/spin.h6
-rw-r--r--src/include/storage/standby.h2
-rw-r--r--src/include/storage/standbydefs.h2
-rw-r--r--src/include/tcop/deparse_utility.h2
-rw-r--r--src/include/tcop/dest.h3
-rw-r--r--src/include/tcop/fastpath.h4
-rw-r--r--src/include/tcop/pquery.h4
-rw-r--r--src/include/tcop/tcopprot.h14
-rw-r--r--src/include/tcop/utility.h19
-rw-r--r--src/include/tsearch/dicts/regis.h2
-rw-r--r--src/include/tsearch/dicts/spell.h4
-rw-r--r--src/include/tsearch/ts_cache.h2
-rw-r--r--src/include/tsearch/ts_locale.h2
-rw-r--r--src/include/tsearch/ts_public.h2
-rw-r--r--src/include/tsearch/ts_type.h100
-rw-r--r--src/include/tsearch/ts_utils.h197
-rw-r--r--src/include/utils/.gitignore1
-rw-r--r--src/include/utils/acl.h37
-rw-r--r--src/include/utils/aclchk_internal.h2
-rw-r--r--src/include/utils/array.h63
-rw-r--r--src/include/utils/arrayaccess.h2
-rw-r--r--src/include/utils/ascii.h8
-rw-r--r--src/include/utils/attoptcache.h2
-rw-r--r--src/include/utils/backend_random.h19
-rw-r--r--src/include/utils/builtins.h1218
-rw-r--r--src/include/utils/bytea.h27
-rw-r--r--src/include/utils/cash.h48
-rw-r--r--src/include/utils/catcache.h11
-rw-r--r--src/include/utils/combocid.h2
-rw-r--r--src/include/utils/date.h145
-rw-r--r--src/include/utils/datetime.h15
-rw-r--r--src/include/utils/datum.h2
-rw-r--r--src/include/utils/dsa.h125
-rw-r--r--src/include/utils/dynahash.h2
-rw-r--r--src/include/utils/dynamic_loader.h2
-rw-r--r--src/include/utils/elog.h13
-rw-r--r--src/include/utils/evtcache.h2
-rw-r--r--src/include/utils/expandeddatum.h2
-rw-r--r--src/include/utils/fmgrtab.h2
-rw-r--r--src/include/utils/formatting.h14
-rw-r--r--src/include/utils/freepage.h99
-rw-r--r--src/include/utils/geo_decls.h264
-rw-r--r--src/include/utils/guc.h9
-rw-r--r--src/include/utils/guc_tables.h3
-rw-r--r--src/include/utils/help_config.h2
-rw-r--r--src/include/utils/hsearch.h2
-rw-r--r--src/include/utils/index_selfuncs.h20
-rw-r--r--src/include/utils/inet.h52
-rw-r--r--src/include/utils/int8.h106
-rw-r--r--src/include/utils/inval.h4
-rw-r--r--src/include/utils/json.h66
-rw-r--r--src/include/utils/jsonapi.h20
-rw-r--r--src/include/utils/jsonb.h111
-rw-r--r--src/include/utils/logtape.h10
-rw-r--r--src/include/utils/lsyscache.h38
-rw-r--r--src/include/utils/memdebug.h50
-rw-r--r--src/include/utils/memutils.h83
-rw-r--r--src/include/utils/nabstime.h65
-rw-r--r--src/include/utils/numeric.h2
-rw-r--r--src/include/utils/palloc.h4
-rw-r--r--src/include/utils/pg_crc.h2
-rw-r--r--src/include/utils/pg_locale.h37
-rw-r--r--src/include/utils/pg_lsn.h18
-rw-r--r--src/include/utils/pg_rusage.h2
-rw-r--r--src/include/utils/plancache.h21
-rw-r--r--src/include/utils/portal.h9
-rw-r--r--src/include/utils/queryenvironment.h74
-rw-r--r--src/include/utils/rangetypes.h82
-rw-r--r--src/include/utils/regproc.h28
-rw-r--r--src/include/utils/rel.h105
-rw-r--r--src/include/utils/relcache.h9
-rw-r--r--src/include/utils/relfilenodemap.h2
-rw-r--r--src/include/utils/relmapper.h2
-rw-r--r--src/include/utils/relptr.h77
-rw-r--r--src/include/utils/reltrigger.h9
-rw-r--r--src/include/utils/resowner.h2
-rw-r--r--src/include/utils/resowner_private.h2
-rw-r--r--src/include/utils/rls.h2
-rw-r--r--src/include/utils/ruleutils.h4
-rw-r--r--src/include/utils/sampling.h2
-rw-r--r--src/include/utils/selfuncs.h36
-rw-r--r--src/include/utils/snapmgr.h15
-rw-r--r--src/include/utils/snapshot.h5
-rw-r--r--src/include/utils/sortsupport.h2
-rw-r--r--src/include/utils/spccache.h2
-rw-r--r--src/include/utils/syscache.h17
-rw-r--r--src/include/utils/timeout.h2
-rw-r--r--src/include/utils/timestamp.h175
-rw-r--r--src/include/utils/tqual.h2
-rw-r--r--src/include/utils/tuplesort.h14
-rw-r--r--src/include/utils/tuplestore.h4
-rw-r--r--src/include/utils/typcache.h7
-rw-r--r--src/include/utils/tzparser.h2
-rw-r--r--src/include/utils/uuid.h10
-rw-r--r--src/include/utils/varbit.h48
-rw-r--r--src/include/utils/varlena.h37
-rw-r--r--src/include/utils/xml.h42
-rw-r--r--src/include/windowapi.h2
-rw-r--r--src/interfaces/ecpg/compatlib/Makefile4
-rw-r--r--src/interfaces/ecpg/ecpglib/Makefile4
-rw-r--r--src/interfaces/ecpg/ecpglib/data.c2
-rw-r--r--src/interfaces/ecpg/ecpglib/execute.c3
-rw-r--r--src/interfaces/ecpg/ecpglib/extern.h2
-rw-r--r--src/interfaces/ecpg/ecpglib/misc.c10
-rw-r--r--src/interfaces/ecpg/ecpglib/pg_type.h20
-rw-r--r--src/interfaces/ecpg/ecpglib/po/fr.po48
-rw-r--r--src/interfaces/ecpg/ecpglib/po/ko.po10
-rw-r--r--src/interfaces/ecpg/ecpglib/po/ru.po107
-rw-r--r--src/interfaces/ecpg/ecpglib/typename.c1
-rw-r--r--src/interfaces/ecpg/include/ecpg_config.h.in4
-rw-r--r--src/interfaces/ecpg/include/pgtypes_interval.h6
-rw-r--r--src/interfaces/ecpg/include/pgtypes_timestamp.h5
-rw-r--r--src/interfaces/ecpg/pgtypeslib/Makefile4
-rw-r--r--src/interfaces/ecpg/pgtypeslib/datetime.c7
-rw-r--r--src/interfaces/ecpg/pgtypeslib/dt.h40
-rw-r--r--src/interfaces/ecpg/pgtypeslib/dt_common.c80
-rw-r--r--src/interfaces/ecpg/pgtypeslib/interval.c54
-rw-r--r--src/interfaces/ecpg/pgtypeslib/numeric.c4
-rw-r--r--src/interfaces/ecpg/pgtypeslib/timestamp.c82
-rw-r--r--src/interfaces/ecpg/preproc/Makefile11
-rw-r--r--src/interfaces/ecpg/preproc/check_rules.pl14
-rw-r--r--src/interfaces/ecpg/preproc/descriptor.c28
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.addons30
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.c98
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.header14
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.trailer26
-rw-r--r--src/interfaces/ecpg/preproc/extern.h10
-rw-r--r--src/interfaces/ecpg/preproc/keywords.c2
-rw-r--r--src/interfaces/ecpg/preproc/output.c66
-rw-r--r--src/interfaces/ecpg/preproc/parse.pl4
-rw-r--r--src/interfaces/ecpg/preproc/parser.c12
-rw-r--r--src/interfaces/ecpg/preproc/pgc.l70
-rw-r--r--src/interfaces/ecpg/preproc/po/de.po186
-rw-r--r--src/interfaces/ecpg/preproc/po/fr.po465
-rw-r--r--src/interfaces/ecpg/preproc/po/ko.po374
-rw-r--r--src/interfaces/ecpg/preproc/po/pl.po214
-rw-r--r--src/interfaces/ecpg/preproc/po/pt_BR.po146
-rw-r--r--src/interfaces/ecpg/preproc/po/ru.po197
-rw-r--r--src/interfaces/ecpg/preproc/type.c40
-rw-r--r--src/interfaces/ecpg/preproc/variable.c2
-rw-r--r--src/interfaces/ecpg/test/Makefile13
-rw-r--r--src/interfaces/ecpg/test/compat_informix/test_informix2.pgc4
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-test_informix2.c4
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c6
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout2
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-nan_test.c102
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-nan_test.stderr354
-rw-r--r--src/interfaces/ecpg/test/expected/sql-twophase.c114
-rw-r--r--src/interfaces/ecpg/test/expected/sql-twophase.stderr34
-rw-r--r--src/interfaces/ecpg/test/expected/sql-twophase.stdout0
-rw-r--r--src/interfaces/ecpg/test/expected/thread-thread.c2
-rw-r--r--src/interfaces/ecpg/test/expected/thread-thread_implicit.c2
-rw-r--r--src/interfaces/ecpg/test/pg_regress_ecpg.c5
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc6
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/nan_test.pgc2
-rw-r--r--src/interfaces/ecpg/test/sql/.gitignore2
-rw-r--r--src/interfaces/ecpg/test/sql/Makefile1
-rw-r--r--src/interfaces/ecpg/test/sql/twophase.pgc44
-rw-r--r--src/interfaces/ecpg/test/thread/thread.pgc2
-rw-r--r--src/interfaces/ecpg/test/thread/thread_implicit.pgc2
-rw-r--r--src/interfaces/libpq/.gitignore17
-rw-r--r--src/interfaces/libpq/Makefile35
-rw-r--r--src/interfaces/libpq/bcc32.mak312
-rw-r--r--src/interfaces/libpq/exports.txt1
-rw-r--r--src/interfaces/libpq/fe-auth-scram.c707
-rw-r--r--src/interfaces/libpq/fe-auth.c537
-rw-r--r--src/interfaces/libpq/fe-auth.h13
-rw-r--r--src/interfaces/libpq/fe-connect.c1162
-rw-r--r--src/interfaces/libpq/fe-exec.c8
-rw-r--r--src/interfaces/libpq/fe-lobj.c2
-rw-r--r--src/interfaces/libpq/fe-misc.c15
-rw-r--r--src/interfaces/libpq/fe-print.c5
-rw-r--r--src/interfaces/libpq/fe-protocol2.c2
-rw-r--r--src/interfaces/libpq/fe-protocol3.c2
-rw-r--r--src/interfaces/libpq/fe-secure-openssl.c514
-rw-r--r--src/interfaces/libpq/fe-secure.c2
-rw-r--r--src/interfaces/libpq/libpq-events.c2
-rw-r--r--src/interfaces/libpq/libpq-events.h2
-rw-r--r--src/interfaces/libpq/libpq-fe.h9
-rw-r--r--src/interfaces/libpq/libpq-int.h70
-rw-r--r--src/interfaces/libpq/libpq.rc.in10
-rw-r--r--src/interfaces/libpq/po/de.po435
-rw-r--r--src/interfaces/libpq/po/fr.po402
-rw-r--r--src/interfaces/libpq/po/ko.po307
-rw-r--r--src/interfaces/libpq/po/pl.po443
-rw-r--r--src/interfaces/libpq/po/pt_BR.po372
-rw-r--r--src/interfaces/libpq/po/ru.po435
-rw-r--r--src/interfaces/libpq/po/zh_CN.po6
-rw-r--r--src/interfaces/libpq/pqexpbuffer.c2
-rw-r--r--src/interfaces/libpq/pqexpbuffer.h2
-rw-r--r--src/interfaces/libpq/pthread-win32.c3
-rw-r--r--src/interfaces/libpq/test/.gitignore3
-rw-r--r--src/interfaces/libpq/test/Makefile2
-rw-r--r--src/interfaces/libpq/test/regress.pl15
-rw-r--r--src/interfaces/libpq/test/uri-regress.c2
-rw-r--r--src/interfaces/libpq/win32.c4
-rw-r--r--src/interfaces/libpq/win32.h8
-rw-r--r--src/interfaces/libpq/win32.mak366
-rw-r--r--src/makefiles/Makefile.cygwin4
-rw-r--r--src/makefiles/Makefile.sco13
-rw-r--r--src/makefiles/Makefile.unixware35
-rw-r--r--src/makefiles/Makefile.win324
-rw-r--r--src/makefiles/pgxs.mk4
-rw-r--r--src/pl/plperl/Util.xs10
-rw-r--r--src/pl/plperl/expected/plperl_trigger.out29
-rw-r--r--src/pl/plperl/plc_perlboot.pl7
-rw-r--r--src/pl/plperl/plc_trusted.pl3
-rw-r--r--src/pl/plperl/plperl.c406
-rw-r--r--src/pl/plperl/plperl.h2
-rw-r--r--src/pl/plperl/po/de.po104
-rw-r--r--src/pl/plperl/po/fr.po82
-rw-r--r--src/pl/plperl/po/ko.po107
-rw-r--r--src/pl/plperl/po/pl.po106
-rw-r--r--src/pl/plperl/po/pt_BR.po95
-rw-r--r--src/pl/plperl/po/ru.po115
-rw-r--r--src/pl/plperl/ppport.h12
-rw-r--r--src/pl/plperl/sql/plperl_trigger.sql32
-rw-r--r--src/pl/plperl/text2macro.pl8
-rw-r--r--src/pl/plpgsql/src/Makefile2
-rw-r--r--src/pl/plpgsql/src/generate-plerrcodes.pl4
-rw-r--r--src/pl/plpgsql/src/pl_comp.c45
-rw-r--r--src/pl/plpgsql/src/pl_exec.c727
-rw-r--r--src/pl/plpgsql/src/pl_funcs.c18
-rw-r--r--src/pl/plpgsql/src/pl_gram.y45
-rw-r--r--src/pl/plpgsql/src/pl_handler.c11
-rw-r--r--src/pl/plpgsql/src/pl_scanner.c6
-rw-r--r--src/pl/plpgsql/src/plpgsql.h550
-rw-r--r--src/pl/plpgsql/src/po/fr.po316
-rw-r--r--src/pl/plpgsql/src/po/ko.po399
-rw-r--r--src/pl/plpgsql/src/po/pt_BR.po369
-rw-r--r--src/pl/plpgsql/src/po/ru.po363
-rw-r--r--src/pl/plpython/Makefile4
-rw-r--r--src/pl/plpython/expected/plpython_composite.out29
-rw-r--r--src/pl/plpython/expected/plpython_ereport.out28
-rw-r--r--src/pl/plpython/expected/plpython_setof.out7
-rw-r--r--src/pl/plpython/expected/plpython_spi.out23
-rw-r--r--src/pl/plpython/expected/plpython_test.out29
-rw-r--r--src/pl/plpython/expected/plpython_trigger.out21
-rw-r--r--src/pl/plpython/expected/plpython_types.out129
-rw-r--r--src/pl/plpython/expected/plpython_types_3.out129
-rw-r--r--src/pl/plpython/generate-spiexceptions.pl4
-rw-r--r--src/pl/plpython/plpy_cursorobject.c16
-rw-r--r--src/pl/plpython/plpy_cursorobject.h1
-rw-r--r--src/pl/plpython/plpy_elog.c2
-rw-r--r--src/pl/plpython/plpy_exec.c96
-rw-r--r--src/pl/plpython/plpy_main.c8
-rw-r--r--src/pl/plpython/plpy_planobject.c34
-rw-r--r--src/pl/plpython/plpy_plpymodule.c89
-rw-r--r--src/pl/plpython/plpy_procedure.c6
-rw-r--r--src/pl/plpython/plpy_spi.c29
-rw-r--r--src/pl/plpython/plpy_spi.h1
-rw-r--r--src/pl/plpython/plpy_subxactobject.c7
-rw-r--r--src/pl/plpython/plpy_typeio.c412
-rw-r--r--src/pl/plpython/plpy_typeio.h19
-rw-r--r--src/pl/plpython/plpython.h2
-rw-r--r--src/pl/plpython/po/de.po185
-rw-r--r--src/pl/plpython/po/fr.po271
-rw-r--r--src/pl/plpython/po/ko.po220
-rw-r--r--src/pl/plpython/po/pl.po246
-rw-r--r--src/pl/plpython/po/pt_BR.po185
-rw-r--r--src/pl/plpython/po/ru.po227
-rw-r--r--src/pl/plpython/sql/plpython_composite.sql21
-rw-r--r--src/pl/plpython/sql/plpython_ereport.sql8
-rw-r--r--src/pl/plpython/sql/plpython_spi.sql22
-rw-r--r--src/pl/plpython/sql/plpython_test.sql4
-rw-r--r--src/pl/plpython/sql/plpython_trigger.sql24
-rw-r--r--src/pl/plpython/sql/plpython_types.sql72
-rw-r--r--src/pl/tcl/Makefile9
-rw-r--r--src/pl/tcl/expected/pltcl_queries.out392
-rw-r--r--src/pl/tcl/expected/pltcl_setup.out159
-rw-r--r--src/pl/tcl/expected/pltcl_start_proc.out31
-rw-r--r--src/pl/tcl/expected/pltcl_subxact.out143
-rw-r--r--src/pl/tcl/generate-pltclerrcodes.pl4
-rw-r--r--src/pl/tcl/modules/.gitignore3
-rw-r--r--src/pl/tcl/modules/Makefile28
-rw-r--r--src/pl/tcl/modules/README18
-rw-r--r--src/pl/tcl/modules/pltcl_delmod.in117
-rw-r--r--src/pl/tcl/modules/pltcl_listmod.in123
-rw-r--r--src/pl/tcl/modules/pltcl_loadmod.in501
-rw-r--r--src/pl/tcl/modules/unknown.pltcl63
-rw-r--r--src/pl/tcl/pltcl.c1177
-rw-r--r--src/pl/tcl/po/de.po90
-rw-r--r--src/pl/tcl/po/pl.po120
-rw-r--r--src/pl/tcl/po/ru.po118
-rw-r--r--src/pl/tcl/sql/pltcl_queries.sql139
-rw-r--r--src/pl/tcl/sql/pltcl_setup.sql155
-rw-r--r--src/pl/tcl/sql/pltcl_start_proc.sql21
-rw-r--r--src/pl/tcl/sql/pltcl_subxact.sql95
-rw-r--r--src/port/Makefile4
-rw-r--r--src/port/README2
-rw-r--r--src/port/chklocale.c15
-rw-r--r--src/port/dirent.c2
-rw-r--r--src/port/dirmod.c21
-rw-r--r--src/port/erand48.c7
-rw-r--r--src/port/fls.c2
-rw-r--r--src/port/fseeko.c2
-rw-r--r--src/port/getaddrinfo.c2
-rw-r--r--src/port/getpeereid.c2
-rw-r--r--src/port/getrusage.c3
-rw-r--r--src/port/inet_net_ntop.c1
-rw-r--r--src/port/isinf.c2
-rw-r--r--src/port/kill.c2
-rw-r--r--src/port/mkdtemp.c2
-rw-r--r--src/port/noblock.c26
-rw-r--r--src/port/open.c3
-rw-r--r--src/port/path.c2
-rw-r--r--src/port/pg_crc32c_choose.c2
-rw-r--r--src/port/pg_crc32c_sb8.c2
-rw-r--r--src/port/pg_crc32c_sse42.c2
-rw-r--r--src/port/pg_strong_random.c149
-rw-r--r--src/port/pgcheckdir.c2
-rw-r--r--src/port/pgsleep.c5
-rw-r--r--src/port/pgstrcasecmp.c2
-rw-r--r--src/port/pqsignal.c33
-rw-r--r--src/port/quotes.c2
-rw-r--r--src/port/random.c2
-rw-r--r--src/port/sprompt.c46
-rw-r--r--src/port/srandom.c2
-rw-r--r--src/port/strlcpy.c2
-rw-r--r--src/port/system.c3
-rw-r--r--src/port/tar.c2
-rw-r--r--src/port/thread.c2
-rw-r--r--src/port/unsetenv.c2
-rw-r--r--src/port/win32env.c158
-rw-r--r--src/port/win32error.c5
-rw-r--r--src/port/win32security.c186
-rw-r--r--src/port/win32setlocale.c2
-rw-r--r--src/port/win32ver.rc6
-rw-r--r--src/template/darwin11
-rw-r--r--src/template/freebsd5
-rw-r--r--src/template/linux5
-rw-r--r--src/template/sco1
-rw-r--r--src/template/unixware41
-rw-r--r--src/test/Makefile2
-rw-r--r--src/test/README6
-rw-r--r--src/test/authentication/.gitignore2
-rw-r--r--src/test/authentication/Makefile20
-rw-r--r--src/test/authentication/README16
-rw-r--r--src/test/authentication/t/001_password.pl86
-rw-r--r--src/test/authentication/t/002_saslprep.pl105
-rw-r--r--src/test/examples/testlibpq2.c4
-rw-r--r--src/test/examples/testlo.c2
-rw-r--r--src/test/examples/testlo64.c2
-rw-r--r--src/test/isolation/.gitignore4
-rw-r--r--src/test/isolation/Makefile8
-rw-r--r--src/test/isolation/expected/eval-plan-qual.out21
-rw-r--r--src/test/isolation/expected/insert-conflict-do-nothing-2.out105
-rw-r--r--src/test/isolation/expected/insert-conflict-toast.out15
-rw-r--r--src/test/isolation/expected/insert-conflict-toast_1.out15
-rw-r--r--src/test/isolation/expected/read-only-anomaly-2.out44
-rw-r--r--src/test/isolation/expected/read-only-anomaly-3.out26
-rw-r--r--src/test/isolation/expected/read-only-anomaly.out25
-rw-r--r--src/test/isolation/expected/sequence-ddl.out85
-rw-r--r--src/test/isolation/expected/vacuum-reltuples.out59
-rw-r--r--src/test/isolation/isolation_main.c4
-rw-r--r--src/test/isolation/isolation_schedule7
-rw-r--r--src/test/isolation/isolationtester.c21
-rw-r--r--src/test/isolation/isolationtester.h2
-rw-r--r--src/test/isolation/specparse.y36
-rw-r--r--src/test/isolation/specs/eval-plan-qual.spec17
-rw-r--r--src/test/isolation/specs/insert-conflict-do-nothing-2.spec34
-rw-r--r--src/test/isolation/specs/insert-conflict-toast.spec51
-rw-r--r--src/test/isolation/specs/read-only-anomaly-2.spec42
-rw-r--r--src/test/isolation/specs/read-only-anomaly-3.spec39
-rw-r--r--src/test/isolation/specs/read-only-anomaly.spec38
-rw-r--r--src/test/isolation/specs/receipt-report.spec2
-rw-r--r--src/test/isolation/specs/sequence-ddl.spec39
-rw-r--r--src/test/isolation/specs/two-ids.spec2
-rw-r--r--src/test/isolation/specs/vacuum-reltuples.spec48
-rw-r--r--src/test/isolation/specscanner.l6
-rwxr-xr-xsrc/test/locale/sort-test.pl8
-rw-r--r--src/test/mb/expected/big5.out1
-rw-r--r--src/test/mb/expected/euc_jp.out1
-rw-r--r--src/test/mb/expected/euc_kr.out1
-rw-r--r--src/test/mb/expected/euc_tw.out1
-rw-r--r--src/test/mb/expected/gb18030.out1
-rw-r--r--src/test/mb/expected/mule_internal.out2
-rw-r--r--src/test/mb/expected/sjis.out1
-rw-r--r--src/test/mb/expected/utf8.out1
-rw-r--r--src/test/modules/commit_ts/Makefile10
-rw-r--r--src/test/modules/commit_ts/expected/commit_timestamp.out12
-rw-r--r--src/test/modules/commit_ts/expected/commit_timestamp_1.out12
-rw-r--r--src/test/modules/commit_ts/t/002_standby.pl10
-rw-r--r--src/test/modules/commit_ts/t/003_standby_2.pl7
-rw-r--r--src/test/modules/commit_ts/t/004_restart.pl122
-rw-r--r--src/test/modules/dummy_seclabel/dummy_seclabel.c2
-rw-r--r--src/test/modules/dummy_seclabel/expected/dummy_seclabel.out34
-rw-r--r--src/test/modules/dummy_seclabel/sql/dummy_seclabel.sql10
-rw-r--r--src/test/modules/snapshot_too_old/.gitignore2
-rw-r--r--src/test/modules/snapshot_too_old/Makefile8
-rw-r--r--src/test/modules/snapshot_too_old/expected/sto_using_hash_index.out15
-rw-r--r--src/test/modules/snapshot_too_old/specs/sto_using_hash_index.spec31
-rw-r--r--src/test/modules/test_ddl_deparse/Makefile1
-rw-r--r--src/test/modules/test_ddl_deparse/expected/alter_ts_config.out8
-rw-r--r--src/test/modules/test_ddl_deparse/expected/comment_on.out4
-rw-r--r--src/test/modules/test_ddl_deparse/sql/alter_ts_config.sql8
-rw-r--r--src/test/modules/test_ddl_deparse/sql/comment_on.sql5
-rw-r--r--src/test/modules/test_ddl_deparse/test_ddl_deparse.c6
-rw-r--r--src/test/modules/test_extensions/Makefile7
-rw-r--r--src/test/modules/test_extensions/expected/test_extensions.out83
-rw-r--r--src/test/modules/test_extensions/sql/test_extensions.sql47
-rw-r--r--src/test/modules/test_extensions/test_ext3--1.0.sql6
-rw-r--r--src/test/modules/test_extensions/test_ext7--1.0--2.0.sql8
-rw-r--r--src/test/modules/test_extensions/test_ext7--1.0.sql13
-rw-r--r--src/test/modules/test_extensions/test_ext7.control4
-rw-r--r--src/test/modules/test_extensions/test_ext8--1.0.sql21
-rw-r--r--src/test/modules/test_extensions/test_ext8.control4
-rw-r--r--src/test/modules/test_parser/test_parser.c3
-rw-r--r--src/test/modules/test_pg_dump/Makefile12
-rw-r--r--src/test/modules/test_pg_dump/expected/test_pg_dump.out100
-rw-r--r--src/test/modules/test_pg_dump/sql/test_pg_dump.sql108
-rw-r--r--src/test/modules/test_pg_dump/t/001_base.pl462
-rw-r--r--src/test/modules/test_pg_dump/test_pg_dump--1.0.sql27
-rw-r--r--src/test/modules/test_rls_hooks/expected/test_rls_hooks.out24
-rw-r--r--src/test/modules/test_rls_hooks/test_rls_hooks.c2
-rw-r--r--src/test/modules/test_rls_hooks/test_rls_hooks.h2
-rw-r--r--src/test/modules/test_shm_mq/setup.c7
-rw-r--r--src/test/modules/test_shm_mq/test.c5
-rw-r--r--src/test/modules/test_shm_mq/test_shm_mq.h2
-rw-r--r--src/test/modules/test_shm_mq/worker.c2
-rw-r--r--src/test/modules/worker_spi/worker_spi.c12
-rw-r--r--src/test/perl/Makefile33
-rw-r--r--src/test/perl/PostgresNode.pm461
-rw-r--r--src/test/perl/README33
-rw-r--r--src/test/perl/RecursiveCopy.pm4
-rw-r--r--src/test/perl/TestLib.pm51
-rw-r--r--src/test/recovery/Makefile6
-rw-r--r--src/test/recovery/t/001_stream_rep.pl229
-rw-r--r--src/test/recovery/t/002_archiving.pl12
-rw-r--r--src/test/recovery/t/003_recovery_targets.pl48
-rw-r--r--src/test/recovery/t/004_timeline_switch.pl23
-rw-r--r--src/test/recovery/t/005_replay_delay.pl5
-rw-r--r--src/test/recovery/t/006_logical_decoding.pl152
-rw-r--r--src/test/recovery/t/007_sync_rep.pl33
-rw-r--r--src/test/recovery/t/008_fsm_truncation.pl98
-rw-r--r--src/test/recovery/t/009_twophase.pl309
-rw-r--r--src/test/recovery/t/010_logical_decoding_timelines.pl197
-rw-r--r--src/test/recovery/t/011_crash_recovery.pl66
-rw-r--r--src/test/recovery/t/012_subtransactions.pl234
-rw-r--r--src/test/regress/GNUmakefile6
-rw-r--r--src/test/regress/expected/aggregates.out89
-rw-r--r--src/test/regress/expected/alter_generic.out83
-rw-r--r--src/test/regress/expected/alter_table.out1063
-rw-r--r--src/test/regress/expected/arrays.out122
-rw-r--r--src/test/regress/expected/boolean.out24
-rw-r--r--src/test/regress/expected/box.out104
-rw-r--r--src/test/regress/expected/brin.out139
-rw-r--r--src/test/regress/expected/case.out45
-rw-r--r--src/test/regress/expected/collate.icu.utf8.out1126
-rw-r--r--src/test/regress/expected/collate.linux.utf8.out112
-rw-r--r--src/test/regress/expected/collate.out20
-rw-r--r--src/test/regress/expected/combocid.out27
-rw-r--r--src/test/regress/expected/copy2.out112
-rw-r--r--src/test/regress/expected/create_function_3.out17
-rw-r--r--src/test/regress/expected/create_index.out103
-rw-r--r--src/test/regress/expected/create_table.out522
-rw-r--r--src/test/regress/expected/create_table_like.out100
-rw-r--r--src/test/regress/expected/create_view.out398
-rw-r--r--src/test/regress/expected/date.out8
-rw-r--r--src/test/regress/expected/enum.out88
-rw-r--r--src/test/regress/expected/equivclass.out70
-rw-r--r--src/test/regress/expected/errors.out2
-rw-r--r--src/test/regress/expected/event_trigger.out3
-rw-r--r--src/test/regress/expected/expressions.out77
-rw-r--r--src/test/regress/expected/foreign_data.out383
-rw-r--r--src/test/regress/expected/foreign_key.out34
-rw-r--r--src/test/regress/expected/gist.out34
-rw-r--r--src/test/regress/expected/groupingsets.out619
-rw-r--r--src/test/regress/expected/hash_index.out33
-rw-r--r--src/test/regress/expected/horology.out96
-rw-r--r--src/test/regress/expected/hs_standby_allowed.out8
-rw-r--r--src/test/regress/expected/hs_standby_functions.out2
-rw-r--r--src/test/regress/expected/identity.out322
-rw-r--r--src/test/regress/expected/indirect_toast.out8
-rw-r--r--src/test/regress/expected/inet.out148
-rw-r--r--src/test/regress/expected/inherit.out649
-rw-r--r--src/test/regress/expected/init_privs.out2
-rw-r--r--src/test/regress/expected/insert.out345
-rw-r--r--src/test/regress/expected/insert_conflict.out61
-rw-r--r--src/test/regress/expected/interval.out82
-rw-r--r--src/test/regress/expected/join.out436
-rw-r--r--src/test/regress/expected/json.out565
-rw-r--r--src/test/regress/expected/jsonb.out589
-rw-r--r--src/test/regress/expected/large_object.out15
-rw-r--r--src/test/regress/expected/limit.out10
-rw-r--r--src/test/regress/expected/macaddr.out1
-rw-r--r--src/test/regress/expected/macaddr8.out354
-rw-r--r--src/test/regress/expected/matview.out129
-rw-r--r--src/test/regress/expected/money.out259
-rw-r--r--src/test/regress/expected/numeric.out16
-rw-r--r--src/test/regress/expected/object_address.out51
-rw-r--r--src/test/regress/expected/oidjoins.out72
-rw-r--r--src/test/regress/expected/opr_sanity.out105
-rw-r--r--src/test/regress/expected/password.out91
-rw-r--r--src/test/regress/expected/plpgsql.out346
-rw-r--r--src/test/regress/expected/polymorphism.out69
-rw-r--r--src/test/regress/expected/portals.out16
-rw-r--r--src/test/regress/expected/prepared_xacts.out15
-rw-r--r--src/test/regress/expected/prepared_xacts_1.out14
-rw-r--r--src/test/regress/expected/privileges.out182
-rw-r--r--src/test/regress/expected/psql.out254
-rw-r--r--src/test/regress/expected/psql_crosstab.out13
-rw-r--r--src/test/regress/expected/publication.out242
-rw-r--r--src/test/regress/expected/rangefuncs.out12
-rw-r--r--src/test/regress/expected/regex.linux.utf8.out164
-rw-r--r--src/test/regress/expected/regex.out28
-rw-r--r--src/test/regress/expected/replica_identity.out45
-rw-r--r--src/test/regress/expected/rolenames.out2
-rw-r--r--src/test/regress/expected/rowsecurity.out126
-rw-r--r--src/test/regress/expected/rowtypes.out49
-rw-r--r--src/test/regress/expected/rules.out221
-rw-r--r--src/test/regress/expected/sanity_check.out22
-rw-r--r--src/test/regress/expected/select.out10
-rw-r--r--src/test/regress/expected/select_parallel.out210
-rw-r--r--src/test/regress/expected/select_views.out44
-rw-r--r--src/test/regress/expected/select_views_1.out1530
-rw-r--r--src/test/regress/expected/sequence.out336
-rw-r--r--src/test/regress/expected/sequence_1.out521
-rw-r--r--src/test/regress/expected/stats.out41
-rw-r--r--src/test/regress/expected/stats_ext.out507
-rw-r--r--src/test/regress/expected/strings.out4
-rw-r--r--src/test/regress/expected/subscription.out161
-rw-r--r--src/test/regress/expected/subselect.out157
-rw-r--r--src/test/regress/expected/sysviews.out121
-rw-r--r--src/test/regress/expected/tablesample.out32
-rw-r--r--src/test/regress/expected/tidscan.out179
-rw-r--r--src/test/regress/expected/timestamptz.out339
-rw-r--r--src/test/regress/expected/triggers.out130
-rw-r--r--src/test/regress/expected/tsdicts.out22
-rw-r--r--src/test/regress/expected/tsearch.out217
-rw-r--r--src/test/regress/expected/tsrf.out617
-rw-r--r--src/test/regress/expected/tstypes.out310
-rw-r--r--src/test/regress/expected/txid.out84
-rw-r--r--src/test/regress/expected/type_sanity.out17
-rw-r--r--src/test/regress/expected/typed_table.out67
-rw-r--r--src/test/regress/expected/union.out119
-rw-r--r--src/test/regress/expected/updatable_views.out164
-rw-r--r--src/test/regress/expected/update.out38
-rw-r--r--src/test/regress/expected/uuid.out13
-rw-r--r--src/test/regress/expected/vacuum.out9
-rw-r--r--src/test/regress/expected/with.out37
-rw-r--r--src/test/regress/expected/xml.out504
-rw-r--r--src/test/regress/expected/xml_1.out475
-rw-r--r--src/test/regress/expected/xml_2.out504
-rw-r--r--src/test/regress/expected/xmlmap.out64
-rw-r--r--src/test/regress/expected/xmlmap_1.out6
-rw-r--r--src/test/regress/input/create_function_2.source14
-rw-r--r--src/test/regress/input/tablespace.source50
-rw-r--r--src/test/regress/output/constraints.source2
-rw-r--r--src/test/regress/output/create_function_1.source2
-rw-r--r--src/test/regress/output/create_function_2.source12
-rw-r--r--src/test/regress/output/tablespace.source141
-rw-r--r--src/test/regress/parallel_schedule21
-rw-r--r--src/test/regress/pg_regress.c103
-rw-r--r--src/test/regress/pg_regress.h3
-rw-r--r--src/test/regress/pg_regress_main.c11
-rw-r--r--src/test/regress/regress.c74
-rw-r--r--src/test/regress/serial_schedule13
-rw-r--r--src/test/regress/sql/aggregates.sql25
-rw-r--r--src/test/regress/sql/alter_generic.sql35
-rw-r--r--src/test/regress/sql/alter_table.sql454
-rw-r--r--src/test/regress/sql/arrays.sql50
-rw-r--r--src/test/regress/sql/boolean.sql20
-rw-r--r--src/test/regress/sql/box.sql47
-rw-r--r--src/test/regress/sql/brin.sql89
-rw-r--r--src/test/regress/sql/case.sql46
-rw-r--r--src/test/regress/sql/collate.icu.utf8.sql433
-rw-r--r--src/test/regress/sql/collate.linux.utf8.sql34
-rw-r--r--src/test/regress/sql/combocid.sql18
-rw-r--r--src/test/regress/sql/copy2.sql92
-rw-r--r--src/test/regress/sql/create_function_3.sql10
-rw-r--r--src/test/regress/sql/create_index.sql17
-rw-r--r--src/test/regress/sql/create_table.sql378
-rw-r--r--src/test/regress/sql/create_table_like.sql19
-rw-r--r--src/test/regress/sql/create_view.sql46
-rw-r--r--src/test/regress/sql/date.sql2
-rw-r--r--src/test/regress/sql/enum.sql41
-rw-r--r--src/test/regress/sql/equivclass.sql41
-rw-r--r--src/test/regress/sql/errors.sql2
-rw-r--r--src/test/regress/sql/event_trigger.sql3
-rw-r--r--src/test/regress/sql/expressions.sql36
-rw-r--r--src/test/regress/sql/foreign_data.sql95
-rw-r--r--src/test/regress/sql/foreign_key.sql35
-rw-r--r--src/test/regress/sql/gist.sql16
-rw-r--r--src/test/regress/sql/groupingsets.sql158
-rw-r--r--src/test/regress/sql/hash_index.sql41
-rw-r--r--src/test/regress/sql/horology.sql30
-rw-r--r--src/test/regress/sql/hs_primary_extremes.sql2
-rw-r--r--src/test/regress/sql/hs_primary_setup.sql2
-rw-r--r--src/test/regress/sql/hs_standby_allowed.sql2
-rw-r--r--src/test/regress/sql/hs_standby_functions.sql2
-rw-r--r--src/test/regress/sql/identity.sql192
-rw-r--r--src/test/regress/sql/indirect_toast.sql8
-rw-r--r--src/test/regress/sql/inet.sql23
-rw-r--r--src/test/regress/sql/inherit.sql177
-rw-r--r--src/test/regress/sql/init_privs.sql2
-rw-r--r--src/test/regress/sql/insert.sql248
-rw-r--r--src/test/regress/sql/insert_conflict.sql52
-rw-r--r--src/test/regress/sql/interval.sql32
-rw-r--r--src/test/regress/sql/join.sql154
-rw-r--r--src/test/regress/sql/json.sql188
-rw-r--r--src/test/regress/sql/jsonb.sql193
-rw-r--r--src/test/regress/sql/large_object.sql8
-rw-r--r--src/test/regress/sql/macaddr8.sql89
-rw-r--r--src/test/regress/sql/matview.sql26
-rw-r--r--src/test/regress/sql/money.sql65
-rw-r--r--src/test/regress/sql/numeric.sql8
-rw-r--r--src/test/regress/sql/object_address.sql34
-rw-r--r--src/test/regress/sql/oidjoins.sql36
-rw-r--r--src/test/regress/sql/opr_sanity.sql81
-rw-r--r--src/test/regress/sql/password.sql72
-rw-r--r--src/test/regress/sql/plpgsql.sql348
-rw-r--r--src/test/regress/sql/polymorphism.sql22
-rw-r--r--src/test/regress/sql/prepared_xacts.sql6
-rw-r--r--src/test/regress/sql/privileges.sql110
-rw-r--r--src/test/regress/sql/psql.sql183
-rw-r--r--src/test/regress/sql/psql_crosstab.sql11
-rw-r--r--src/test/regress/sql/publication.sql144
-rw-r--r--src/test/regress/sql/rangefuncs.sql8
-rw-r--r--src/test/regress/sql/regex.linux.utf8.sql46
-rw-r--r--src/test/regress/sql/regex.sql7
-rw-r--r--src/test/regress/sql/replica_identity.sql2
-rw-r--r--src/test/regress/sql/rolenames.sql2
-rw-r--r--src/test/regress/sql/rowsecurity.sql49
-rw-r--r--src/test/regress/sql/rowtypes.sql28
-rw-r--r--src/test/regress/sql/rules.sql17
-rw-r--r--src/test/regress/sql/sanity_check.sql2
-rw-r--r--src/test/regress/sql/select.sql3
-rw-r--r--src/test/regress/sql/select_parallel.sql96
-rw-r--r--src/test/regress/sql/select_views.sql2
-rw-r--r--src/test/regress/sql/sequence.sql172
-rw-r--r--src/test/regress/sql/stats.sql32
-rw-r--r--src/test/regress/sql/stats_ext.sql283
-rw-r--r--src/test/regress/sql/subscription.sql123
-rw-r--r--src/test/regress/sql/subselect.sql58
-rw-r--r--src/test/regress/sql/sysviews.sql51
-rw-r--r--src/test/regress/sql/tablesample.sql8
-rw-r--r--src/test/regress/sql/tidscan.sql66
-rw-r--r--src/test/regress/sql/timestamptz.sql118
-rw-r--r--src/test/regress/sql/triggers.sql120
-rw-r--r--src/test/regress/sql/truncate.sql118
-rw-r--r--src/test/regress/sql/tsdicts.sql4
-rw-r--r--src/test/regress/sql/tsearch.sql44
-rw-r--r--src/test/regress/sql/tsrf.sql154
-rw-r--r--src/test/regress/sql/tstypes.sql58
-rw-r--r--src/test/regress/sql/txid.sql45
-rw-r--r--src/test/regress/sql/type_sanity.sql4
-rw-r--r--src/test/regress/sql/typed_table.sql21
-rw-r--r--src/test/regress/sql/union.sql41
-rw-r--r--src/test/regress/sql/updatable_views.sql39
-rw-r--r--src/test/regress/sql/update.sql31
-rw-r--r--src/test/regress/sql/vacuum.sql10
-rw-r--r--src/test/regress/sql/with.sql22
-rw-r--r--src/test/regress/sql/xml.sql288
-rw-r--r--src/test/regress/sql/xmlmap.sql3
-rw-r--r--src/test/ssl/Makefile13
-rw-r--r--src/test/ssl/README4
-rw-r--r--src/test/ssl/ServerSetup.pm91
-rw-r--r--src/test/ssl/cas.config7
-rw-r--r--src/test/ssl/root_ca.config4
-rw-r--r--src/test/ssl/server-cn-only.config1
-rw-r--r--src/test/ssl/server-no-names.config1
-rw-r--r--src/test/ssl/server-revoked.config1
-rw-r--r--src/test/ssl/ssl/.gitignore5
-rw-r--r--src/test/ssl/ssl/both-cas-1.crt67
-rw-r--r--src/test/ssl/ssl/both-cas-2.crt67
-rw-r--r--src/test/ssl/ssl/client+client_ca.crt25
-rw-r--r--src/test/ssl/ssl/client-revoked.crt16
-rw-r--r--src/test/ssl/ssl/client-revoked.key26
-rw-r--r--src/test/ssl/ssl/client.crl12
-rw-r--r--src/test/ssl/ssl/client.crt16
-rw-r--r--src/test/ssl/ssl/client.key26
-rw-r--r--src/test/ssl/ssl/client_ca.crt22
-rw-r--r--src/test/ssl/ssl/client_ca.key26
-rw-r--r--src/test/ssl/ssl/root+client.crl22
-rw-r--r--src/test/ssl/ssl/root+client_ca.crt45
-rw-r--r--src/test/ssl/ssl/root+server.crl22
-rw-r--r--src/test/ssl/ssl/root+server_ca.crt45
-rw-r--r--src/test/ssl/ssl/root.crl10
-rw-r--r--src/test/ssl/ssl/root_ca.crt23
-rw-r--r--src/test/ssl/ssl/root_ca.key26
-rw-r--r--src/test/ssl/ssl/server-cn-and-alt-names.crt18
-rw-r--r--src/test/ssl/ssl/server-cn-and-alt-names.key26
-rw-r--r--src/test/ssl/ssl/server-cn-only.crt16
-rw-r--r--src/test/ssl/ssl/server-cn-only.key26
-rw-r--r--src/test/ssl/ssl/server-multiple-alt-names.crt16
-rw-r--r--src/test/ssl/ssl/server-multiple-alt-names.key26
-rw-r--r--src/test/ssl/ssl/server-no-names.crt14
-rw-r--r--src/test/ssl/ssl/server-no-names.key26
-rw-r--r--src/test/ssl/ssl/server-revoked.crt16
-rw-r--r--src/test/ssl/ssl/server-revoked.key26
-rw-r--r--src/test/ssl/ssl/server-single-alt-name.crt14
-rw-r--r--src/test/ssl/ssl/server-single-alt-name.key26
-rw-r--r--src/test/ssl/ssl/server-ss.crt16
-rw-r--r--src/test/ssl/ssl/server-ss.key26
-rw-r--r--src/test/ssl/ssl/server.crl12
-rw-r--r--src/test/ssl/ssl/server_ca.crt22
-rw-r--r--src/test/ssl/ssl/server_ca.key26
-rw-r--r--src/test/ssl/t/001_ssltests.pl53
-rw-r--r--src/test/subscription/.gitignore2
-rw-r--r--src/test/subscription/Makefile22
-rw-r--r--src/test/subscription/README16
-rw-r--r--src/test/subscription/t/001_rep_changes.pl228
-rw-r--r--src/test/subscription/t/002_types.pl552
-rw-r--r--src/test/subscription/t/003_constraints.pl116
-rw-r--r--src/test/subscription/t/004_sync.pl164
-rw-r--r--src/test/subscription/t/005_encoding.pl56
-rw-r--r--src/test/thread/Makefile2
-rw-r--r--src/test/thread/thread_test.c2
-rw-r--r--src/timezone/README10
-rw-r--r--src/timezone/data/africa130
-rw-r--r--src/timezone/data/antarctica67
-rw-r--r--src/timezone/data/asia644
-rw-r--r--src/timezone/data/australasia209
-rw-r--r--src/timezone/data/backward2
-rw-r--r--src/timezone/data/backzone152
-rw-r--r--src/timezone/data/etcetera74
-rw-r--r--src/timezone/data/europe709
-rw-r--r--src/timezone/data/factory9
-rw-r--r--src/timezone/data/northamerica160
-rw-r--r--src/timezone/data/southamerica558
-rw-r--r--src/timezone/known_abbrevs.txt192
-rw-r--r--src/timezone/localtime.c103
-rw-r--r--src/timezone/pgtz.c58
-rw-r--r--src/timezone/pgtz.h2
-rw-r--r--src/timezone/private.h77
-rw-r--r--src/timezone/strftime.c15
-rw-r--r--src/timezone/tzfile.h52
-rw-r--r--src/timezone/tznames/Africa.txt2
-rw-r--r--src/timezone/tznames/America.txt112
-rw-r--r--src/timezone/tznames/Antarctica.txt29
-rw-r--r--src/timezone/tznames/Asia.txt151
-rw-r--r--src/timezone/tznames/Atlantic.txt15
-rw-r--r--src/timezone/tznames/Australia12
-rw-r--r--src/timezone/tznames/Australia.txt45
-rw-r--r--src/timezone/tznames/Default391
-rw-r--r--src/timezone/tznames/Etc.txt2
-rw-r--r--src/timezone/tznames/Europe.txt16
-rw-r--r--src/timezone/tznames/Indian.txt27
-rw-r--r--src/timezone/tznames/Pacific.txt114
-rw-r--r--src/timezone/tznames/README12
-rw-r--r--src/timezone/zic.c712
-rw-r--r--src/tools/RELEASE_CHANGES48
-rwxr-xr-xsrc/tools/check_bison_recursion.pl2
-rwxr-xr-xsrc/tools/copyright.pl2
-rwxr-xr-xsrc/tools/find_typedef2
-rw-r--r--src/tools/findoidjoins/Makefile2
-rw-r--r--src/tools/findoidjoins/README31
-rw-r--r--src/tools/findoidjoins/findoidjoins.c22
-rw-r--r--src/tools/fix-old-flex-code.pl66
-rwxr-xr-xsrc/tools/git_changelog6
-rw-r--r--src/tools/ifaddrs/.gitignore1
-rw-r--r--src/tools/ifaddrs/Makefile6
-rw-r--r--src/tools/ifaddrs/test_ifaddrs.c5
-rw-r--r--src/tools/msvc/Install.pm31
-rw-r--r--src/tools/msvc/Mkvcbuild.pm139
-rw-r--r--src/tools/msvc/Project.pm30
-rw-r--r--src/tools/msvc/Solution.pm208
-rw-r--r--src/tools/msvc/build.pl24
-rw-r--r--src/tools/msvc/builddoc.pl2
-rwxr-xr-xsrc/tools/msvc/clean.bat5
-rw-r--r--src/tools/msvc/config_default.pl3
-rw-r--r--src/tools/msvc/ecpg_regression.proj2
-rw-r--r--src/tools/msvc/gendef.pl31
-rwxr-xr-xsrc/tools/msvc/install.pl4
-rw-r--r--src/tools/msvc/mkvcbuild.pl4
-rw-r--r--src/tools/msvc/pgbison.pl4
-rw-r--r--src/tools/msvc/pgflex.pl37
-rw-r--r--src/tools/msvc/vcregress.pl84
-rwxr-xr-xsrc/tools/pginclude/pgcheckdefines91
-rw-r--r--src/tools/pgindent/README9
-rwxr-xr-xsrc/tools/pgindent/pgindent9
-rw-r--r--src/tools/pgindent/typedefs.list422
-rw-r--r--src/tools/testint128.c183
-rwxr-xr-xsrc/tools/version_stamp.pl36
-rwxr-xr-xsrc/tools/win32tzlist.pl8
-rw-r--r--src/tutorial/complex.source2
-rw-r--r--src/tutorial/funcs_new.c29
-rw-r--r--src/tutorial/syscat.source2
-rw-r--r--src/win32.mak32
2431 files changed, 417339 insertions, 477184 deletions
diff --git a/src/Makefile b/src/Makefile
index 5706bb1335..79cfeeb710 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -23,11 +23,13 @@ SUBDIRS = \
backend/snowball \
include \
backend/replication/libpqwalreceiver \
+ backend/replication/pgoutput \
fe_utils \
bin \
pl \
makefiles \
- test/regress
+ test/regress \
+ test/perl
# There are too many interdependencies between the subdirectories, so
# don't attempt parallel make here.
@@ -69,16 +71,5 @@ distclean maintainer-clean:
$(MAKE) -C test/thread $@
rm -f Makefile.port Makefile.global
-coverage:
- $(MAKE) -C timezone $@
- $(MAKE) -C gtm $@
- $(MAKE) -C backend $@
- $(MAKE) -C backend/utils/mb/conversion_procs $@
- $(MAKE) -C backend/snowball $@
- $(MAKE) -C interfaces $@
- $(MAKE) -C backend/replication/libpqwalreceiver $@
- $(MAKE) -C bin $@
- $(MAKE) -C pl $@
-
.PHONY: install-local installdirs-local uninstall-local
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 64b7b73d16..dc7b801dff 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -180,6 +180,7 @@ pgxsdir = $(pkglibdir)/pgxs
#
# Records the choice of the various --enable-xxx and --with-xxx options.
+with_icu = @with_icu@
with_perl = @with_perl@
with_python = @with_python@
with_tcl = @with_tcl@
@@ -198,6 +199,7 @@ enable_dtrace = @enable_dtrace@
enable_coverage = @enable_coverage@
enable_tap_tests = @enable_tap_tests@
enable_thread_safety = @enable_thread_safety@
+enable_strong_random = @enable_strong_random@
python_includespec = @python_includespec@
python_libdir = @python_libdir@
@@ -208,6 +210,9 @@ python_version = @python_version@
krb_srvtab = @krb_srvtab@
+ICU_CFLAGS = @ICU_CFLAGS@
+ICU_LIBS = @ICU_LIBS@
+
TCLSH = @TCLSH@
TCL_LIBS = @TCL_LIBS@
TCL_LIB_SPEC = @TCL_LIB_SPEC@
@@ -266,7 +271,6 @@ UUID_LIBS = @UUID_LIBS@
UUID_EXTRA_OBJS = @UUID_EXTRA_OBJS@
LD = @LD@
with_gnu_ld = @with_gnu_ld@
-ld_R_works = @ld_R_works@
# We want -L for libpgport.a and libpgcommon.a to be first in LDFLAGS. We
# also need LDFLAGS to be a "recursively expanded" variable, else adjustments
@@ -335,8 +339,11 @@ endif
endif
PROVE = @PROVE@
-PG_PROVE_FLAGS = -I $(top_srcdir)/src/test/perl/
-PROVE_FLAGS = --verbose
+# There are common routines in src/test/perl, and some test suites have
+# extra perl modules in their own directory.
+PG_PROVE_FLAGS = -I $(top_srcdir)/src/test/perl/ -I $(srcdir)
+# User-supplied prove flags such as --verbose can be provided in PROVE_FLAGS.
+
# prepend to path if already set, else just set it
define add_to_path
@@ -345,7 +352,7 @@ endef
# platform-specific environment variable to set shared library path
define ld_library_path_var
-$(if $(filter $(PORTNAME),darwin),DYLD_LIBRARY_PATH,$(if $(filter $(PORTNAME),aix),LIBPATH,LD_LIBRARY_PATH))
+$(if $(filter $(PORTNAME),darwin),DYLD_LIBRARY_PATH,$(if $(filter $(PORTNAME),aix),LIBPATH,$(if $(filter $(PORTNAME),hpux),SHLIB_PATH,LD_LIBRARY_PATH)))
endef
define with_temp_install
@@ -356,12 +363,12 @@ ifeq ($(enable_tap_tests),yes)
define prove_installcheck
rm -rf $(CURDIR)/tmp_check/log
-cd $(srcdir) && TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PGPORT='6$(DEF_PGPORT)' top_builddir='$(CURDIR)/$(top_builddir)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) t/*.pl
+cd $(srcdir) && TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PGPORT='6$(DEF_PGPORT)' top_builddir='$(CURDIR)/$(top_builddir)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl)
endef
define prove_check
rm -rf $(CURDIR)/tmp_check/log
-cd $(srcdir) && TESTDIR='$(CURDIR)' $(with_temp_install) PGPORT='6$(DEF_PGPORT)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) t/*.pl
+cd $(srcdir) && TESTDIR='$(CURDIR)' $(with_temp_install) PGPORT='6$(DEF_PGPORT)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl)
endef
else
@@ -395,11 +402,8 @@ STRIP_SHARED_LIB = @STRIP_SHARED_LIB@
# Documentation
-have_docbook = @have_docbook@
-COLLATEINDEX = @COLLATEINDEX@
DBTOEPUB = @DBTOEPUB@
-DOCBOOKSTYLE = @DOCBOOKSTYLE@
-JADE = @JADE@
+FOP = @FOP@
NSGMLS = @NSGMLS@
OSX = @OSX@
XMLLINT = @XMLLINT@
@@ -546,14 +550,35 @@ TEMP_CONF += --temp-config=$(TEMP_CONFIG)
endif
pg_regress_locale_flags = $(if $(ENCODING),--encoding=$(ENCODING)) $(NOLOCALE)
-
-pg_regress_check = $(with_temp_install) $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --temp-instance=./tmp_check $(TEMP_CONF) --bindir= $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
-pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --bindir='$(bindir)' $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
-
-pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/
-
-pg_isolation_regress_check = $(with_temp_install) $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --temp-instance=./tmp_check $(TEMP_CONF) --bindir= $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
-pg_isolation_regress_installcheck = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
+pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ tmp_check_iso/ log/ output_iso/
+
+pg_regress_check = \
+ $(with_temp_install) \
+ $(top_builddir)/src/test/regress/pg_regress \
+ --temp-instance=./tmp_check \
+ --inputdir=$(srcdir) \
+ --bindir= \
+ $(TEMP_CONF) \
+ $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
+pg_regress_installcheck = \
+ $(top_builddir)/src/test/regress/pg_regress \
+ --inputdir=$(srcdir) \
+ --bindir='$(bindir)' \
+ $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
+
+pg_isolation_regress_check = \
+ $(with_temp_install) \
+ $(top_builddir)/src/test/isolation/pg_isolation_regress \
+ --temp-instance=./tmp_check_iso \
+ --inputdir=$(srcdir) --outputdir=output_iso \
+ --bindir= \
+ $(TEMP_CONF) \
+ $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
+pg_isolation_regress_installcheck = \
+ $(top_builddir)/src/test/isolation/pg_isolation_regress \
+ --inputdir=$(srcdir) --outputdir=output_iso \
+ --bindir='$(bindir)' \
+ $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS)
##########################################################################
#
@@ -583,6 +608,11 @@ ifneq ($(CUSTOM_COPT),)
COPT= $(CUSTOM_COPT)
endif
+#
+# These variables are meant to be set in the environment of "make"
+# to add flags to whatever configure picked. Unlike the ones above,
+# they are documented.
+#
ifdef COPT
CFLAGS += $(COPT)
LDFLAGS += $(COPT)
@@ -623,6 +653,7 @@ TAS = @TAS@
ifdef FLEX
$(FLEX) $(if $(FLEX_NO_BACKUP),-b) $(FLEXFLAGS) -o'$@' $<
@$(if $(FLEX_NO_BACKUP),if [ `wc -l <lex.backup` -eq 1 ]; then rm lex.backup; else echo "Scanner requires backup; see lex.backup." 1>&2; exit 1; fi)
+ $(if $(FLEX_FIX_WARNING),$(PERL) $(top_srcdir)/src/tools/fix-old-flex-code.pl '$@')
else
@$(missing) flex $< '$@'
endif
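With the revised prove_check and prove_installcheck definitions above, a subset of TAP tests can be selected at run time via PROVE_TESTS, and extra prove switches can be supplied via PROVE_FLAGS; for example, running "make check PROVE_TESTS='t/001_rep_changes.pl' PROVE_FLAGS='--verbose'" from src/test/subscription runs just that one script verbosely. The particular suite and flag here are only illustrative; any TAP suite in the tree is driven the same way.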
diff --git a/src/Makefile.shlib b/src/Makefile.shlib
index 66452cc2bd..866a2572d4 100644
--- a/src/Makefile.shlib
+++ b/src/Makefile.shlib
@@ -47,9 +47,8 @@
# clean-lib delete the static and shared libraries from the build dir
# maintainer-clean-lib delete .def files built for win32
#
-# Since `all-lib' is the first rule in this file you probably want to
-# have the `all' target before including this file. In the most simple
-# case it would look like this:
+# Typically you would add `all-lib' to the `all' target so that `make all'
+# builds the libraries. In the most simple case it would look like this:
#
# all: all-lib
#
@@ -128,7 +127,7 @@ ifeq ($(PORTNAME), darwin)
else
# loadable module
DLSUFFIX = .so
- LINK.shared = $(COMPILER) -bundle -multiply_defined suppress -Wl,-undefined,dynamic_lookup
+ LINK.shared = $(COMPILER) -bundle -multiply_defined suppress
endif
BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@
exports_file = $(SHLIB_EXPORTS:%.txt=%.list)
@@ -194,7 +193,7 @@ ifeq ($(PORTNAME), hpux)
# can't use the CC-syntax rpath pattern here, so instead:
rpath =
ifeq ($(enable_rpath), yes)
- LINK.shared += +b '$(rpathdir)'
+ LINK.shared += +s +b '$(rpathdir)'
endif
# On HPUX platforms, gcc is usually configured to search for libraries
# in /usr/local/lib, but ld won't do so. Add an explicit -L switch so
@@ -237,30 +236,6 @@ ifeq ($(PORTNAME), solaris)
endif
endif
-ifeq ($(PORTNAME), sco)
- ifeq ($(GCC), yes)
- LINK.shared = $(CC) -shared
- else
- LINK.shared = $(CC) -G
- endif
- LINK.shared += -Wl,-z,text
- ifdef soname
- LINK.shared += -Wl,-h,$(soname)
- endif
-endif
-
-ifeq ($(PORTNAME), unixware)
- ifeq ($(GCC), yes)
- LINK.shared = $(CC) -shared
- else
- LINK.shared = $(CC) -G
- endif
- LINK.shared += -Wl,-z,text
- ifdef soname
- LINK.shared += -Wl,-h,$(soname)
- endif
-endif
-
ifeq ($(PORTNAME), cygwin)
LINK.shared = $(CC) -shared
ifdef SO_MAJOR_VERSION
@@ -323,7 +298,7 @@ endif
endif # shlib_major
# Where possible, restrict the symbols exported by the library to just the
-# official list, so as to avoid unintentional ABI changes. On recent Darwin
+# official list, so as to avoid unintentional ABI changes. On recent macOS
# this also quiets multiply-defined-symbol warnings in programs that use
# libpgport along with libpq.
ifneq (,$(SHLIB_EXPORTS))
@@ -402,7 +377,7 @@ $(shlib): $(OBJS) $(DLL_DEFFILE) | $(SHLIB_PREREQS)
$(CC) $(CFLAGS) -shared -static-libgcc -o $@ $(OBJS) $(DLL_DEFFILE) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK) $(LIBS) -Wl,--out-implib=$(stlib)
endif
-endif # PORTNAME == cgywin
+endif # PORTNAME == cygwin
endif # PORTNAME == cygwin || PORTNAME == win32
@@ -430,30 +405,22 @@ endif # PORTNAME == cygwin || PORTNAME == win32
# tarballs.
ifneq (,$(SHLIB_EXPORTS))
-distprep: lib$(NAME)dll.def lib$(NAME)ddll.def blib$(NAME)dll.def
+distprep: lib$(NAME)dll.def lib$(NAME)ddll.def
UC_NAME = $(shell echo $(NAME) | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
lib$(NAME)dll.def: $(SHLIB_EXPORTS)
- echo '; DEF file for win32.mak release build and for Makefile.shlib (MinGW)' >$@
+ echo '; DEF file for Makefile.shlib (MinGW)' >$@
echo 'LIBRARY LIB$(UC_NAME).dll' >>$@
echo 'EXPORTS' >>$@
sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@
lib$(NAME)ddll.def: $(SHLIB_EXPORTS)
- echo '; DEF file for win32.mak debug build' >$@
+ echo '; DEF file for Makefile.shlib (MinGW)' >$@
echo 'LIBRARY LIB$(UC_NAME)D.dll' >>$@
echo 'EXPORTS' >>$@
sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@
-blib$(NAME)dll.def: $(SHLIB_EXPORTS)
- echo '; DEF file for bcc32.mak (Borland C++ Builder)' >$@
- echo 'LIBRARY BLIB$(UC_NAME)' >>$@
- echo 'EXPORTS' >>$@
- sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ _\1@ \2/' $< >>$@
- echo >>$@
- echo '; Aliases for MS compatible names' >> $@
- sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1= _\1/' $< | sed 's/ *$$//' >>$@
endif # SHLIB_EXPORTS
@@ -542,5 +509,5 @@ clean-lib:
ifneq (,$(SHLIB_EXPORTS))
maintainer-clean-lib:
- rm -f lib$(NAME)dll.def lib$(NAME)ddll.def blib$(NAME)dll.def
+ rm -f lib$(NAME)dll.def lib$(NAME)ddll.def
endif
diff --git a/src/backend/Makefile b/src/backend/Makefile
index faec8d8523..d9aec0e0a4 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -2,7 +2,7 @@
#
# Makefile for the postgres backend
#
-# Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/backend/Makefile
@@ -23,15 +23,15 @@ endif
SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
pgxc main nodes optimizer port postmaster regex replication rewrite \
- storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
+ statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
include $(srcdir)/common.mk
# As of 1/2010:
# The probes.o file is necessary for dtrace support on Solaris, and on recent
# versions of systemtap. (Older systemtap releases just produce an empty
-# file, but that's okay.) However, OS X's dtrace doesn't use it and doesn't
-# even recognize the -G option. So, build probes.o except on Darwin.
+# file, but that's okay.) However, macOS's dtrace doesn't use it and doesn't
+# even recognize the -G option. So, build probes.o except on macOS.
# This might need adjustment as other platforms add dtrace support.
ifneq ($(PORTNAME), darwin)
ifeq ($(enable_dtrace), yes)
@@ -50,6 +50,7 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
$(top_builddir)/src/interfaces/libpq/fe-exec.o \
$(top_builddir)/src/interfaces/libpq/fe-auth.o \
$(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
+ $(top_builddir)/src/interfaces/libpq/fe-auth-scram.o \
$(top_builddir)/src/gtm/client/libgtmclient.a \
$(top_builddir)/src/gtm/common/libgtm.a \
$(top_builddir)/src/gtm/libpq/libpqcomm.a
@@ -78,7 +79,7 @@ ifneq ($(PORTNAME), win32)
ifneq ($(PORTNAME), aix)
postgres: $(OBJS)
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
endif
endif
@@ -159,8 +160,11 @@ storage/lmgr/lwlocknames.h: storage/lmgr/generate-lwlocknames.pl storage/lmgr/lw
utils/errcodes.h: utils/generate-errcodes.pl utils/errcodes.txt
$(MAKE) -C utils errcodes.h
+# see explanation in parser/Makefile
+utils/fmgrprotos.h: utils/fmgroids.h ;
+
utils/fmgroids.h: utils/Gen_fmgrtab.pl catalog/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h
- $(MAKE) -C utils fmgroids.h
+ $(MAKE) -C utils $(notdir $@)
utils/probes.h: utils/probes.d
$(MAKE) -C utils probes.h
@@ -186,7 +190,7 @@ submake-schemapg:
.PHONY: generated-headers
-generated-headers: $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/probes.h
+generated-headers: $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/fmgrprotos.h $(top_builddir)/src/include/utils/probes.h
$(top_builddir)/src/include/parser/gram.h: parser/gram.h
prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \
@@ -213,6 +217,11 @@ $(top_builddir)/src/include/utils/fmgroids.h: utils/fmgroids.h
cd '$(dir $@)' && rm -f $(notdir $@) && \
$(LN_S) "$$prereqdir/$(notdir $<)" .
+$(top_builddir)/src/include/utils/fmgrprotos.h: utils/fmgrprotos.h
+ prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \
+ cd '$(dir $@)' && rm -f $(notdir $@) && \
+ $(LN_S) "$$prereqdir/$(notdir $<)" .
+
$(top_builddir)/src/include/utils/probes.h: utils/probes.h
cd '$(dir $@)' && rm -f $(notdir $@) && \
$(LN_S) "../../../$(subdir)/utils/probes.h" .
@@ -231,7 +240,7 @@ distprep:
$(MAKE) -C catalog schemapg.h postgres.bki postgres.description postgres.shdescription
$(MAKE) -C replication repl_gram.c repl_scanner.c syncrep_gram.c syncrep_scanner.c
$(MAKE) -C storage/lmgr lwlocknames.h
- $(MAKE) -C utils fmgrtab.c fmgroids.h errcodes.h
+ $(MAKE) -C utils fmgrtab.c fmgroids.h fmgrprotos.h errcodes.h
$(MAKE) -C utils/misc guc-file.c
$(MAKE) -C utils/sort qsort_tuple.c
@@ -323,6 +332,7 @@ clean:
$(top_builddir)/src/include/catalog/schemapg.h \
$(top_builddir)/src/include/storage/lwlocknames.h \
$(top_builddir)/src/include/utils/fmgroids.h \
+ $(top_builddir)/src/include/utils/fmgrprotos.h \
$(top_builddir)/src/include/utils/probes.h
ifeq ($(PORTNAME), cygwin)
rm -f postgres.dll libpostgres.a
@@ -351,6 +361,7 @@ maintainer-clean: distclean
storage/lmgr/lwlocknames.c \
storage/lmgr/lwlocknames.h \
utils/fmgroids.h \
+ utils/fmgrprotos.h \
utils/fmgrtab.c \
utils/errcodes.h \
utils/misc/guc-file.c \
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index b194d33cc5..442a46140d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -4,7 +4,7 @@
*
* See src/backend/access/brin/README for details.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -26,8 +26,10 @@
#include "catalog/pg_am.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
+#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -59,10 +61,12 @@ typedef struct BrinOpaque
BrinDesc *bo_bdesc;
} BrinOpaque;
+#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
+
static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
BrinRevmap *revmap, BlockNumber pagesPerRange);
static void terminate_brin_buildstate(BrinBuildState *state);
-static void brinsummarize(Relation index, Relation heapRel,
+static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
double *numSummarized, double *numExisting);
static void form_and_insert_tuple(BrinBuildState *state);
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
@@ -92,6 +96,7 @@ brinhandler(PG_FUNCTION_ARGS)
amroutine->amstorage = true;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
+ amroutine->amcanparallel = false;
amroutine->amkeytype = InvalidOid;
amroutine->ambuild = brinbuild;
@@ -111,6 +116,9 @@ brinhandler(PG_FUNCTION_ARGS)
amroutine->amendscan = brinendscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
@@ -121,57 +129,95 @@ brinhandler(PG_FUNCTION_ARGS)
* with those of the new tuple. If the tuple values are not consistent with
* the summary tuple, we need to update the index tuple.
*
+ * If autosummarization is enabled, check if we need to summarize the previous
+ * page range.
+ *
* If the range is not currently summarized (i.e. the revmap returns NULL for
- * it), there's nothing to do.
+ * it), there's nothing to do for this tuple.
*/
bool
brininsert(Relation idxRel, Datum *values, bool *nulls,
ItemPointer heaptid, Relation heapRel,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
BlockNumber pagesPerRange;
- BrinDesc *bdesc = NULL;
+ BlockNumber origHeapBlk;
+ BlockNumber heapBlk;
+ BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
BrinRevmap *revmap;
Buffer buf = InvalidBuffer;
MemoryContext tupcxt = NULL;
- MemoryContext oldcxt = NULL;
+ MemoryContext oldcxt = CurrentMemoryContext;
+ bool autosummarize = BrinGetAutoSummarize(idxRel);
revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+ /*
+ * origHeapBlk is the block number where the insertion occurred. heapBlk
+ * is the first block in the corresponding page range.
+ */
+ origHeapBlk = ItemPointerGetBlockNumber(heaptid);
+ heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
+
for (;;)
{
bool need_insert = false;
OffsetNumber off;
BrinTuple *brtup;
BrinMemTuple *dtup;
- BlockNumber heapBlk;
int keyno;
CHECK_FOR_INTERRUPTS();
- heapBlk = ItemPointerGetBlockNumber(heaptid);
- /* normalize the block number to be the first block in the range */
- heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
- brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
- BUFFER_LOCK_SHARE, NULL);
+ /*
+ * If auto-summarization is enabled and we just inserted the first
+ * tuple into the first block of a new non-first page range, request a
+ * summarization run of the previous range.
+ */
+ if (autosummarize &&
+ heapBlk > 0 &&
+ heapBlk == origHeapBlk &&
+ ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
+ {
+ BlockNumber lastPageRange = heapBlk - 1;
+ BrinTuple *lastPageTuple;
+
+ lastPageTuple =
+ brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
+ NULL, BUFFER_LOCK_SHARE, NULL);
+ if (!lastPageTuple)
+ AutoVacuumRequestWork(AVW_BRINSummarizeRange,
+ RelationGetRelid(idxRel),
+ lastPageRange);
+ brin_free_tuple(lastPageTuple);
+ }
+
+ brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
+ NULL, BUFFER_LOCK_SHARE, NULL);
/* if range is unsummarized, there's nothing to do */
if (!brtup)
break;
- /* First time through? */
+ /* First time through in this statement? */
if (bdesc == NULL)
{
+ MemoryContextSwitchTo(indexInfo->ii_Context);
bdesc = brin_build_desc(idxRel);
+ indexInfo->ii_AmCache = (void *) bdesc;
+ MemoryContextSwitchTo(oldcxt);
+ }
+ /* First time through in this brininsert call? */
+ if (tupcxt == NULL)
+ {
tupcxt = AllocSetContextCreate(CurrentMemoryContext,
"brininsert cxt",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
- oldcxt = MemoryContextSwitchTo(tupcxt);
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextSwitchTo(tupcxt);
}
- dtup = brin_deform_tuple(bdesc, brtup);
+ dtup = brin_deform_tuple(bdesc, brtup, NULL);
/*
* Compare the key values of the new tuple to the stored index values;
@@ -222,7 +268,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
* re-acquiring the lock.
*/
origsz = ItemIdGetLength(lp);
- origtup = brin_copy_tuple(brtup, origsz);
+ origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
/*
* Before releasing the lock, check if we can attempt a same-page
@@ -259,12 +305,9 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
brinRevmapTerminate(revmap);
if (BufferIsValid(buf))
ReleaseBuffer(buf);
- if (bdesc != NULL)
- {
- brin_free_desc(bdesc);
- MemoryContextSwitchTo(oldcxt);
+ MemoryContextSwitchTo(oldcxt);
+ if (tupcxt != NULL)
MemoryContextDelete(tupcxt);
- }
return false;
}
@@ -320,6 +363,9 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
FmgrInfo *consistentFn;
MemoryContext oldcxt;
MemoryContext perRangeCxt;
+ BrinMemTuple *dtup;
+ BrinTuple *btup = NULL;
+ Size btupsz = 0;
opaque = (BrinOpaque *) scan->opaque;
bdesc = opaque->bo_bdesc;
@@ -341,15 +387,16 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
*/
consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
+ /* allocate an initial in-memory tuple, out of the per-range memcxt */
+ dtup = brin_new_memtuple(bdesc);
+
/*
* Setup and use a per-range memory context, which is reset every time we
* loop below. This avoids having to free the tuples within the loop.
*/
perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
"bringetbitmap cxt",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(perRangeCxt);
/*
@@ -360,6 +407,7 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
{
bool addrange;
+ bool gottuple = false;
BrinTuple *tup;
OffsetNumber off;
Size size;
@@ -373,7 +421,8 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
scan->xs_snapshot);
if (tup)
{
- tup = brin_copy_tuple(tup, size);
+ gottuple = true;
+ btup = brin_copy_tuple(tup, size, btup, &btupsz);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
@@ -381,15 +430,13 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
* For page ranges with no indexed tuple, we must return the whole
* range; otherwise, compare it to the scan keys.
*/
- if (tup == NULL)
+ if (!gottuple)
{
addrange = true;
}
else
{
- BrinMemTuple *dtup;
-
- dtup = brin_deform_tuple(bdesc, tup);
+ dtup = brin_deform_tuple(bdesc, btup, dtup);
if (dtup->bt_placeholder)
{
/*
@@ -741,7 +788,7 @@ brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
brin_vacuum_scan(info->index, info->strategy);
- brinsummarize(info->index, heapRel,
+ brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES,
&stats->num_index_tuples, &stats->num_index_tuples);
heap_close(heapRel, AccessShareLock);
@@ -759,7 +806,8 @@ brinoptions(Datum reloptions, bool validate)
BrinOptions *rdopts;
int numoptions;
static const relopt_parse_elt tab[] = {
- {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}
+ {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
+ {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
};
options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
@@ -786,12 +834,39 @@ brinoptions(Datum reloptions, bool validate)
Datum
brin_summarize_new_values(PG_FUNCTION_ARGS)
{
+ Datum relation = PG_GETARG_DATUM(0);
+
+ return DirectFunctionCall2(brin_summarize_range,
+ relation,
+ Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
+}
+
+/*
+ * SQL-callable function to summarize the indicated page range, if not already
+ * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
+ * unsummarized ranges are summarized.
+ */
+Datum
+brin_summarize_range(PG_FUNCTION_ARGS)
+{
Oid indexoid = PG_GETARG_OID(0);
+ int64 heapBlk64 = PG_GETARG_INT64(1);
+ BlockNumber heapBlk;
Oid heapoid;
Relation indexRel;
Relation heapRel;
double numSummarized = 0;
+ if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
+ {
+ char *blk = psprintf(INT64_FORMAT, heapBlk64);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("block number out of range: %s", blk)));
+ }
+ heapBlk = (BlockNumber) heapBlk64;
+
/*
* We must lock table before index to avoid deadlocks. However, if the
* passed indexoid isn't an index then IndexGetRelation() will fail.
@@ -831,7 +906,7 @@ brin_summarize_new_values(PG_FUNCTION_ARGS)
RelationGetRelationName(indexRel))));
/* OK, do it */
- brinsummarize(indexRel, heapRel, &numSummarized, NULL);
+ brinsummarize(indexRel, heapRel, heapBlk, &numSummarized, NULL);
relation_close(indexRel, ShareUpdateExclusiveLock);
relation_close(heapRel, ShareUpdateExclusiveLock);
@@ -840,6 +915,81 @@ brin_summarize_new_values(PG_FUNCTION_ARGS)
}
/*
+ * SQL-callable interface to mark a range as no longer summarized
+ */
+Datum
+brin_desummarize_range(PG_FUNCTION_ARGS)
+{
+ Oid indexoid = PG_GETARG_OID(0);
+ int64 heapBlk64 = PG_GETARG_INT64(1);
+ BlockNumber heapBlk;
+ Oid heapoid;
+ Relation heapRel;
+ Relation indexRel;
+ bool done;
+
+ if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
+ {
+ char *blk = psprintf(INT64_FORMAT, heapBlk64);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("block number out of range: %s", blk)));
+ }
+ heapBlk = (BlockNumber) heapBlk64;
+
+ /*
+ * We must lock table before index to avoid deadlocks. However, if the
+ * passed indexoid isn't an index then IndexGetRelation() will fail.
+ * Rather than emitting a not-very-helpful error message, postpone
+ * complaining, expecting that the is-it-an-index test below will fail.
+ */
+ heapoid = IndexGetRelation(indexoid, true);
+ if (OidIsValid(heapoid))
+ heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
+ else
+ heapRel = NULL;
+
+ indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+ /* Must be a BRIN index */
+ if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
+ indexRel->rd_rel->relam != BRIN_AM_OID)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is not a BRIN index",
+ RelationGetRelationName(indexRel))));
+
+ /* User must own the index (comparable to privileges needed for VACUUM) */
+ if (!pg_class_ownercheck(indexoid, GetUserId()))
+ aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
+ RelationGetRelationName(indexRel));
+
+ /*
+ * Since we did the IndexGetRelation call above without any lock, it's
+ * barely possible that a race against an index drop/recreation could have
+ * netted us the wrong table. Recheck.
+ */
+ if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_TABLE),
+ errmsg("could not open parent table of index %s",
+ RelationGetRelationName(indexRel))));
+
+ /* the revmap does the hard work */
+ do
+ {
+ done = brinRevmapDesummarizeRange(indexRel, heapBlk);
+ }
+ while (!done);
+
+ relation_close(indexRel, ShareUpdateExclusiveLock);
+ relation_close(heapRel, ShareUpdateExclusiveLock);
+
+ PG_RETURN_VOID();
+}
+
+/*
* Build a BrinDesc used to create or scan a BRIN index
*/
BrinDesc *
@@ -856,9 +1006,7 @@ brin_build_desc(Relation rel)
cxt = AllocSetContextCreate(CurrentMemoryContext,
"brin desc cxt",
- ALLOCSET_SMALL_INITSIZE,
- ALLOCSET_SMALL_MINSIZE,
- ALLOCSET_SMALL_MAXSIZE);
+ ALLOCSET_SMALL_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
tupdesc = RelationGetDescr(rel);
@@ -909,6 +1057,27 @@ brin_free_desc(BrinDesc *bdesc)
}
/*
+ * Fetch index's statistical data into *stats
+ */
+void
+brinGetStats(Relation index, BrinStatsData *stats)
+{
+ Buffer metabuffer;
+ Page metapage;
+ BrinMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = (BrinMetaPageData *) PageGetContents(metapage);
+
+ stats->pagesPerRange = metadata->pagesPerRange;
+ stats->revmapNumPages = metadata->lastRevmapPage - 1;
+
+ UnlockReleaseBuffer(metabuffer);
+}
+
+/*
* Initialize a BrinBuildState appropriate to create tuples on the given index.
*/
static BrinBuildState *
@@ -1048,7 +1217,7 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
/* the placeholder tuple must exist */
if (phtup == NULL)
elog(ERROR, "missing placeholder tuple");
- phtup = brin_copy_tuple(phtup, phsz);
+ phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
/* merge it into the tuple from the heap scan */
@@ -1059,17 +1228,17 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
}
/*
- * Scan a complete BRIN index, and summarize each page range that's not already
- * summarized. The index and heap must have been locked by caller in at
- * least ShareUpdateExclusiveLock mode.
+ * Summarize page ranges that are not already summarized. If pageRange is
+ * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
+ * page range containing the given heap page number is scanned.
*
* For each new index tuple inserted, *numSummarized (if not NULL) is
* incremented; for each existing tuple, *numExisting (if not NULL) is
* incremented.
*/
static void
-brinsummarize(Relation index, Relation heapRel, double *numSummarized,
- double *numExisting)
+brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
+ double *numSummarized, double *numExisting)
{
BrinRevmap *revmap;
BrinBuildState *state = NULL;
@@ -1078,15 +1247,40 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized,
BlockNumber heapBlk;
BlockNumber pagesPerRange;
Buffer buf;
+ BlockNumber startBlk;
+ BlockNumber endBlk;
+
+ /* determine range of pages to process; nothing to do for an empty table */
+ heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+ if (heapNumBlocks == 0)
+ return;
revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
+ if (pageRange == BRIN_ALL_BLOCKRANGES)
+ {
+ startBlk = 0;
+ endBlk = heapNumBlocks;
+ }
+ else
+ {
+ startBlk = (pageRange / pagesPerRange) * pagesPerRange;
+ /* Nothing to do if start point is beyond end of table */
+ if (startBlk > heapNumBlocks)
+ {
+ brinRevmapTerminate(revmap);
+ return;
+ }
+ endBlk = startBlk + pagesPerRange;
+ if (endBlk > heapNumBlocks)
+ endBlk = heapNumBlocks;
+ }
+
/*
* Scan the revmap to find unsummarized items.
*/
buf = InvalidBuffer;
- heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
- for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
+ for (heapBlk = startBlk; heapBlk < endBlk; heapBlk += pagesPerRange)
{
BrinTuple *tup;
OffsetNumber off;
@@ -1169,11 +1363,9 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
/* Use our own memory context to avoid retail pfree */
cxt = AllocSetContextCreate(CurrentMemoryContext,
"brin union",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
- db = brin_deform_tuple(bdesc, b);
+ db = brin_deform_tuple(bdesc, b, NULL);
MemoryContextSwitchTo(oldcxt);
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
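Besides the autosummarize reloption and the SQL-callable brin_summarize_range()/brin_desummarize_range() added above (usable as, for example, SELECT brin_summarize_range('brinidx', 0) or SELECT brin_desummarize_range('brinidx', 0), where 'brinidx' stands in for any BRIN index), brin.c now exports brinGetStats() for reading the metapage counters. A minimal C sketch of a caller, assuming the usual backend headers; the function name log_brin_stats is made up:

    #include "postgres.h"

    #include "access/brin.h"
    #include "access/genam.h"
    #include "utils/rel.h"

    /* Log the BRIN metapage statistics for the given index (sketch only). */
    static void
    log_brin_stats(Oid indexoid)
    {
        Relation        idxrel = index_open(indexoid, AccessShareLock);
        BrinStatsData   stats;

        brinGetStats(idxrel, &stats);
        elog(DEBUG1, "BRIN \"%s\": pages_per_range = %u, revmap pages = %u",
             RelationGetRelationName(idxrel),
             stats.pagesPerRange, stats.revmapNumPages);
        index_close(idxrel, AccessShareLock);
    }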
diff --git a/src/backend/access/brin/brin_inclusion.c b/src/backend/access/brin/brin_inclusion.c
index 0ae7a72996..bc16dd7981 100644
--- a/src/backend/access/brin/brin_inclusion.c
+++ b/src/backend/access/brin/brin_inclusion.c
@@ -16,7 +16,7 @@
* writing is the INET type, where IPv6 values cannot be merged with IPv4
* values.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -30,6 +30,7 @@
#include "access/skey.h"
#include "catalog/pg_amop.h"
#include "catalog/pg_type.h"
+#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
@@ -76,10 +77,6 @@ typedef struct InclusionOpaque
FmgrInfo strategy_procinfos[RTMaxStrategyNumber];
} InclusionOpaque;
-Datum brin_inclusion_opcinfo(PG_FUNCTION_ARGS);
-Datum brin_inclusion_add_value(PG_FUNCTION_ARGS);
-Datum brin_inclusion_consistent(PG_FUNCTION_ARGS);
-Datum brin_inclusion_union(PG_FUNCTION_ARGS);
static FmgrInfo *inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno,
uint16 procnum);
static FmgrInfo *inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno,
@@ -431,7 +428,7 @@ brin_inclusion_consistent(PG_FUNCTION_ARGS)
* It is straightforward to support the equality strategies with
* the contains operator. Generally, inequality strategies do not
* make much sense for the types which will be used with the
- * inclusion BRIN family of opclasses, but is is possible to
+ * inclusion BRIN family of opclasses, but it is possible to
* implement them with logical negation of the left-of and
* right-of operators.
*
diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c
index b7c76e6eda..8f7a0c75b8 100644
--- a/src/backend/access/brin/brin_minmax.c
+++ b/src/backend/access/brin/brin_minmax.c
@@ -2,7 +2,7 @@
* brin_minmax.c
* Implementation of Min/Max opclass for BRIN
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -16,6 +16,7 @@
#include "access/stratnum.h"
#include "catalog/pg_type.h"
#include "catalog/pg_amop.h"
+#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
@@ -28,10 +29,6 @@ typedef struct MinmaxOpaque
FmgrInfo strategy_procinfos[BTMaxStrategyNumber];
} MinmaxOpaque;
-Datum brin_minmax_opcinfo(PG_FUNCTION_ARGS);
-Datum brin_minmax_add_value(PG_FUNCTION_ARGS);
-Datum brin_minmax_consistent(PG_FUNCTION_ARGS);
-Datum brin_minmax_union(PG_FUNCTION_ARGS);
static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno,
Oid subtype, uint16 strategynum);
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index 6ebfedd6a9..1725591b05 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -2,7 +2,7 @@
* brin_pageops.c
* Page-handling routines for BRIN indexes
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -178,10 +178,8 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
}
START_CRIT_SECTION();
- PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
- if (PageAddItemExtended(oldpage, (Item) newtup, newsz, oldoff,
- PAI_OVERWRITE | PAI_ALLOW_FAR_OFFSET) == InvalidOffsetNumber)
- elog(ERROR, "failed to add BRIN tuple");
+ if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz))
+ elog(ERROR, "failed to replace BRIN tuple");
MarkBufferDirty(oldbuf);
/* XLOG stuff */
@@ -247,7 +245,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
if (extended)
brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);
- PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+ PageIndexTupleDeleteNoCompact(oldpage, oldoff);
newoff = PageAddItem(newpage, (Item) newtup, newsz,
InvalidOffsetNumber, false, false);
if (newoff == InvalidOffsetNumber)
@@ -289,7 +287,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
XLogRegisterBufData(0, (char *) newtup, newsz);
/* revmap page */
- XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, revmapbuf, 0);
/* old page */
XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
@@ -550,6 +548,8 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
OffsetNumber off;
OffsetNumber maxoff;
Page page;
+ BrinTuple *btup = NULL;
+ Size btupsz = 0;
page = BufferGetPage(buf);
@@ -569,7 +569,7 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
{
sz = ItemIdGetLength(lp);
tup = (BrinTuple *) PageGetItem(page, lp);
- tup = brin_copy_tuple(tup, sz);
+ tup = brin_copy_tuple(tup, sz, btup, &btupsz);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 853181b3fa..fc8b10ab39 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -12,7 +12,7 @@
* the metapage. When the revmap needs to be expanded, all tuples on the
* regular BRIN page at that block (if any) are moved out of the way.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -168,9 +168,12 @@ brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
iptr = (ItemPointerData *) contents->rm_tids;
iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
- ItemPointerSet(iptr,
- ItemPointerGetBlockNumber(&tid),
- ItemPointerGetOffsetNumber(&tid));
+ if (ItemPointerIsValid(&tid))
+ ItemPointerSet(iptr,
+ ItemPointerGetBlockNumber(&tid),
+ ItemPointerGetOffsetNumber(&tid));
+ else
+ ItemPointerSetInvalid(iptr);
}
/*
@@ -205,7 +208,11 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
/* normalize the heap block number to be the first page in the range */
heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
- /* Compute the revmap page number we need */
+ /*
+ * Compute the revmap page number we need. If Invalid is returned (i.e.,
+ * the revmap page hasn't been created yet), the requested page range is
+ * not summarized.
+ */
mapBlk = revmap_get_blkno(revmap, heapBlk);
if (mapBlk == InvalidBlockNumber)
{
@@ -301,6 +308,138 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
}
/*
+ * Delete an index tuple, marking a page range as unsummarized.
+ *
+ * Index must be locked in ShareUpdateExclusiveLock mode.
+ *
+ * Return FALSE if caller should retry.
+ */
+bool
+brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
+{
+ BrinRevmap *revmap;
+ BlockNumber pagesPerRange;
+ RevmapContents *contents;
+ ItemPointerData *iptr;
+ ItemPointerData invalidIptr;
+ BlockNumber revmapBlk;
+ Buffer revmapBuf;
+ Buffer regBuf;
+ Page revmapPg;
+ Page regPg;
+ OffsetNumber revmapOffset;
+ OffsetNumber regOffset;
+ ItemId lp;
+ BrinTuple *tup;
+
+ revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL);
+
+ revmapBlk = revmap_get_blkno(revmap, heapBlk);
+ if (!BlockNumberIsValid(revmapBlk))
+ {
+ /* revmap page doesn't exist: range not summarized, we're done */
+ brinRevmapTerminate(revmap);
+ return true;
+ }
+
+ /* Lock the revmap page, obtain the index tuple pointer from it */
+ revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+ revmapPg = BufferGetPage(revmapBuf);
+ revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+ contents = (RevmapContents *) PageGetContents(revmapPg);
+ iptr = contents->rm_tids;
+ iptr += revmapOffset;
+
+ if (!ItemPointerIsValid(iptr))
+ {
+ /* no index tuple: range not summarized, we're done */
+ LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+ brinRevmapTerminate(revmap);
+ return true;
+ }
+
+ regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr));
+ LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);
+ regPg = BufferGetPage(regBuf);
+
+ /* if this is no longer a regular page, tell caller to start over */
+ if (!BRIN_IS_REGULAR_PAGE(regPg))
+ {
+ LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(regBuf, BUFFER_LOCK_UNLOCK);
+ brinRevmapTerminate(revmap);
+ return false;
+ }
+
+ regOffset = ItemPointerGetOffsetNumber(iptr);
+ if (regOffset > PageGetMaxOffsetNumber(regPg))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("corrupted BRIN index: inconsistent range map")));
+
+ lp = PageGetItemId(regPg, regOffset);
+ if (!ItemIdIsUsed(lp))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("corrupted BRIN index: inconsistent range map")));
+ tup = (BrinTuple *) PageGetItem(regPg, lp);
+ /* XXX apply sanity checks? Might as well delete a bogus tuple ... */
+
+ /*
+ * We're only removing data, not reading it, so there's no need to
+ * TestForOldSnapshot here.
+ */
+
+ /*
+ * Because of SUE lock, this function shouldn't run concurrently with
+ * summarization. Placeholder tuples can only exist as leftovers from
+ * crashed summarization, so if we detect any, we complain but proceed.
+ */
+ if (BrinTupleIsPlaceholder(tup))
+ ereport(WARNING,
+ (errmsg("leftover placeholder tuple detected in BRIN index \"%s\", deleting",
+ RelationGetRelationName(idxrel))));
+
+ START_CRIT_SECTION();
+
+ ItemPointerSetInvalid(&invalidIptr);
+ brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk,
+ invalidIptr);
+ PageIndexTupleDeleteNoCompact(regPg, regOffset);
+ /* XXX record free space in FSM? */
+
+ MarkBufferDirty(regBuf);
+ MarkBufferDirty(revmapBuf);
+
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_desummarize xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.pagesPerRange = revmap->rm_pagesPerRange;
+ xlrec.heapBlk = heapBlk;
+ xlrec.regOffset = regOffset;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize);
+ XLogRegisterBuffer(0, revmapBuf, 0);
+ XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD);
+ recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE);
+ PageSetLSN(revmapPg, recptr);
+ PageSetLSN(regPg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(regBuf);
+ LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+ brinRevmapTerminate(revmap);
+
+ return true;
+}
+
+/*
* Given a heap block number, find the corresponding physical revmap block
* number and return it. If the revmap page hasn't been allocated yet, return
* InvalidBlockNumber.
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
index 64b8264959..e2e1d23377 100644
--- a/src/backend/access/brin/brin_tuple.c
+++ b/src/backend/access/brin/brin_tuple.c
@@ -23,7 +23,7 @@
* Note the size of the null bitmask may not be the same as that of the
* datum array.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -311,17 +311,26 @@ brin_free_tuple(BrinTuple *tuple)
}
/*
- * Create a palloc'd copy of a BrinTuple.
+ * Given a brin tuple of size len, create a copy of it. If 'dest' is not
+ * NULL, its size is destsz, and can be used as output buffer; if the tuple
+ * to be copied does not fit, it is enlarged by repalloc, and the size is
+ * updated to match. This avoids palloc/free cycles when many brin tuples
+ * are being processed in loops.
*/
BrinTuple *
-brin_copy_tuple(BrinTuple *tuple, Size len)
+brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
{
- BrinTuple *newtup;
+ if (!destsz || *destsz == 0)
+ dest = palloc(len);
+ else if (len > *destsz)
+ {
+ dest = repalloc(dest, len);
+ *destsz = len;
+ }
- newtup = palloc(len);
- memcpy(newtup, tuple, len);
+ memcpy(dest, tuple, len);
- return newtup;
+ return dest;
}
/*
@@ -348,56 +357,69 @@ BrinMemTuple *
brin_new_memtuple(BrinDesc *brdesc)
{
BrinMemTuple *dtup;
- char *currdatum;
long basesize;
- int i;
basesize = MAXALIGN(sizeof(BrinMemTuple) +
sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored);
- currdatum = (char *) dtup + basesize;
- for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
- {
- dtup->bt_columns[i].bv_attno = i + 1;
- dtup->bt_columns[i].bv_allnulls = true;
- dtup->bt_columns[i].bv_hasnulls = false;
- dtup->bt_columns[i].bv_values = (Datum *) currdatum;
- currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
- }
+
+ dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
+ dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+ dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
"brin dtuple",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
+
+ brin_memtuple_initialize(dtup, brdesc);
+
return dtup;
}
/*
- * Reset a BrinMemTuple to initial state
+ * Reset a BrinMemTuple to initial state. We return the same tuple, for
+ * notational convenience.
*/
-void
+BrinMemTuple *
brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
{
int i;
+ char *currdatum;
MemoryContextReset(dtuple->bt_context);
+
+ currdatum = (char *) dtuple +
+ MAXALIGN(sizeof(BrinMemTuple) +
+ sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
{
dtuple->bt_columns[i].bv_allnulls = true;
dtuple->bt_columns[i].bv_hasnulls = false;
+
+ dtuple->bt_columns[i].bv_attno = i + 1;
+ dtuple->bt_columns[i].bv_allnulls = true;
+ dtuple->bt_columns[i].bv_hasnulls = false;
+ dtuple->bt_columns[i].bv_values = (Datum *) currdatum;
+ currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
}
+
+ return dtuple;
}
/*
* Convert a BrinTuple back to a BrinMemTuple. This is the reverse of
* brin_form_tuple.
*
+ * As an optimization, the caller can pass a previously allocated 'dMemtuple'.
+ * This avoids having to allocate it here, which can be useful when this
+ * function is called many times in a loop. It is caller's responsibility
+ * that the given BrinMemTuple matches what we need here.
+ *
* Note we don't need the "on disk tupdesc" here; we rely on our own routine to
* deconstruct the tuple from the on-disk format.
*/
BrinMemTuple *
-brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
+brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
{
BrinMemTuple *dtup;
Datum *values;
@@ -409,15 +431,16 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
int valueno;
MemoryContext oldcxt;
- dtup = brin_new_memtuple(brdesc);
+ dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) :
+ brin_new_memtuple(brdesc);
if (BrinTupleIsPlaceholder(tuple))
dtup->bt_placeholder = true;
dtup->bt_blkno = tuple->bt_blkno;
- values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
- allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
- hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+ values = dtup->bt_values;
+ allnulls = dtup->bt_allnulls;
+ hasnulls = dtup->bt_hasnulls;
tp = (char *) tuple + BrinTupleDataOffset(tuple);
@@ -460,10 +483,6 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
MemoryContextSwitchTo(oldcxt);
- pfree(values);
- pfree(allnulls);
- pfree(hasnulls);
-
return dtup;
}
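The new brin_copy_tuple()/brin_deform_tuple() signatures above exist so that callers in a loop can keep reusing one copy buffer and one BrinMemTuple instead of allocating per iteration, in the way bringetbitmap() and brin_evacuate_page() do earlier in this patch. A rough sketch of the calling pattern; next_index_tuple() is a made-up stand-in for whatever yields a BrinTuple and its length:

    BrinTuple    *btup = NULL;      /* copy buffer, (re)allocated on demand */
    Size          btupsz = 0;
    BrinMemTuple *dtup = brin_new_memtuple(bdesc);
    BrinTuple    *tup;
    Size          size;

    while ((tup = next_index_tuple(&size)) != NULL)
    {
        btup = brin_copy_tuple(tup, size, btup, &btupsz);   /* copy out of the page */
        dtup = brin_deform_tuple(bdesc, btup, dtup);        /* resets and refills dtup */
        /* ... inspect dtup->bt_columns[] ... */
    }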
diff --git a/src/backend/access/brin/brin_validate.c b/src/backend/access/brin/brin_validate.c
index 1f1011e0ac..dc23e00e89 100644
--- a/src/backend/access/brin/brin_validate.c
+++ b/src/backend/access/brin/brin_validate.c
@@ -3,7 +3,7 @@
* brin_validate.c
* Opclass validator for BRIN.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -23,6 +23,7 @@
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/syscache.h"
+#include "utils/regproc.h"
/*
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
index 27ba0a97f8..dff7198a39 100644
--- a/src/backend/access/brin/brin_xlog.c
+++ b/src/backend/access/brin/brin_xlog.c
@@ -2,7 +2,7 @@
* brin_xlog.c
* XLog replay routines for BRIN indexes
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -13,6 +13,7 @@
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
+#include "access/bufmask.h"
#include "access/xlogutils.h"
@@ -148,10 +149,8 @@ brin_xlog_update(XLogReaderState *record)
page = (Page) BufferGetPage(buffer);
offnum = xlrec->oldOffnum;
- if (PageGetMaxOffsetNumber(page) + 1 < offnum)
- elog(PANIC, "brin_xlog_update: invalid max offset number");
- PageIndexDeleteNoCompact(page, &offnum, 1);
+ PageIndexTupleDeleteNoCompact(page, offnum);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
@@ -189,14 +188,9 @@ brin_xlog_samepage_update(XLogReaderState *record)
page = (Page) BufferGetPage(buffer);
offnum = xlrec->offnum;
- if (PageGetMaxOffsetNumber(page) + 1 < offnum)
- elog(PANIC, "brin_xlog_samepage_update: invalid max offset number");
- PageIndexDeleteNoCompact(page, &offnum, 1);
- offnum = PageAddItemExtended(page, (Item) brintuple, tuplen, offnum,
- PAI_OVERWRITE | PAI_ALLOW_FAR_OFFSET);
- if (offnum == InvalidOffsetNumber)
- elog(PANIC, "brin_xlog_samepage_update: failed to add tuple");
+ if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen))
+ elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple");
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
@@ -260,6 +254,46 @@ brin_xlog_revmap_extend(XLogReaderState *record)
UnlockReleaseBuffer(metabuf);
}
+static void
+brin_xlog_desummarize_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_brin_desummarize *xlrec;
+ Buffer buffer;
+ XLogRedoAction action;
+
+ xlrec = (xl_brin_desummarize *) XLogRecGetData(record);
+
+ /* Update the revmap */
+ action = XLogReadBufferForRedo(record, 0, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ ItemPointerData iptr;
+
+ ItemPointerSetInvalid(&iptr);
+ brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, iptr);
+
+ PageSetLSN(BufferGetPage(buffer), lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* remove the leftover entry from the regular page */
+ action = XLogReadBufferForRedo(record, 1, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page regPg = BufferGetPage(buffer);
+
+ PageIndexTupleDeleteNoCompact(regPg, xlrec->regOffset);
+
+ PageSetLSN(regPg, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
void
brin_redo(XLogReaderState *record)
{
@@ -282,7 +316,29 @@ brin_redo(XLogReaderState *record)
case XLOG_BRIN_REVMAP_EXTEND:
brin_xlog_revmap_extend(record);
break;
+ case XLOG_BRIN_DESUMMARIZE:
+ brin_xlog_desummarize_page(record);
+ break;
default:
elog(PANIC, "brin_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a BRIN page before doing consistency checks.
+ */
+void
+brin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+
+ if (BRIN_IS_REGULAR_PAGE(page))
+ {
+ /* Regular brin pages contain unused space which needs to be masked. */
+ mask_unused_space(page);
+ }
+}
diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile
index 1fa6de0823..fb27944b89 100644
--- a/src/backend/access/common/Makefile
+++ b/src/backend/access/common/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/common
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = heaptuple.o indextuple.o printtup.o reloptions.o scankey.o \
- tupconvert.o tupdesc.o
+OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \
+ reloptions.o scankey.o tupconvert.o tupdesc.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
new file mode 100644
index 0000000000..10253d3354
--- /dev/null
+++ b/src/backend/access/common/bufmask.c
@@ -0,0 +1,128 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmask.c
+ * Routines for buffer masking. Used to mask certain bits
+ * in a page which can be different when the WAL is generated
+ * and when the WAL is applied.
+ *
+ * Portions Copyright (c) 2016-2017, PostgreSQL Global Development Group
+ *
+ * Contains common routines required for masking a page.
+ *
+ * IDENTIFICATION
+ * src/backend/access/common/bufmask.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/bufmask.h"
+
+/*
+ * mask_page_lsn
+ *
+ * In consistency checks, the LSN of the two pages compared will likely be
+ * different because of concurrent operations when the WAL is generated
+ * and the state of the page when WAL is applied.
+ */
+void
+mask_page_lsn(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER);
+}
+
+/*
+ * mask_page_hint_bits
+ *
+ * Mask hint bits in PageHeader. We want to ignore differences in hint bits,
+ * since they can be set without emitting any WAL.
+ */
+void
+mask_page_hint_bits(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ /* Ignore prune_xid (it's like a hint-bit) */
+ phdr->pd_prune_xid = MASK_MARKER;
+
+ /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */
+ PageClearFull(page);
+ PageClearHasFreeLinePointers(page);
+
+ /*
+ * During replay, if the page LSN has advanced past our XLOG record's LSN,
+ * we don't mark the page all-visible. See heap_xlog_visible() for
+ * details.
+ */
+ PageClearAllVisible(page);
+}
+
+/*
+ * mask_unused_space
+ *
+ * Mask the unused space of a page between pd_lower and pd_upper.
+ */
+void
+mask_unused_space(Page page)
+{
+ int pd_lower = ((PageHeader) page)->pd_lower;
+ int pd_upper = ((PageHeader) page)->pd_upper;
+ int pd_special = ((PageHeader) page)->pd_special;
+
+ /* Sanity check */
+ if (pd_lower > pd_upper || pd_special < pd_upper ||
+ pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
+ {
+ elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n",
+ pd_lower, pd_upper, pd_special);
+ }
+
+ memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower);
+}
+
+/*
+ * mask_lp_flags
+ *
+ * In some index AMs, line pointer flags can be modified in master without
+ * emitting any WAL record.
+ */
+void
+mask_lp_flags(Page page)
+{
+ OffsetNumber offnum,
+ maxoff;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsUsed(itemId))
+ itemId->lp_flags = LP_UNUSED;
+ }
+}
+
+/*
+ * mask_page_content
+ *
+ * In some index AMs, the contents of deleted pages need to be almost
+ * completely ignored.
+ */
+void
+mask_page_content(Page page)
+{
+ /* Mask Page Content */
+ memset(page + SizeOfPageHeaderData, MASK_MARKER,
+ BLCKSZ - SizeOfPageHeaderData);
+
+ /* Mask pd_lower and pd_upper */
+ memset(&((PageHeader) page)->pd_lower, MASK_MARKER,
+ sizeof(uint16));
+ memset(&((PageHeader) page)->pd_upper, MASK_MARKER,
+ sizeof(uint16));
+}
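bufmask.c only supplies the primitives; each access method that participates in consistency checking provides its own callback that composes them for its page layout, in the way brin_mask() does earlier in this patch. A hypothetical callback for some other AM (the name myam_mask is made up, and which primitives apply depends on what that AM treats as hint-level state) might look like:

    #include "postgres.h"

    #include "access/bufmask.h"

    void
    myam_mask(char *pagedata, BlockNumber blkno)
    {
        Page        page = (Page) pagedata;

        /* LSN and hint bits legitimately differ between original and replayed pages */
        mask_page_lsn(page);
        mask_page_hint_bits(page);

        /* assume this AM does not WAL-log unused space or line-pointer flags */
        mask_unused_space(page);
        mask_lp_flags(page);
    }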
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 15a18a51cc..970e3aa6c9 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -46,7 +46,7 @@
*
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -875,6 +875,72 @@ heap_modify_tuple(HeapTuple tuple,
}
/*
+ * heap_modify_tuple_by_cols
+ * form a new tuple from an old tuple and a set of replacement values.
+ *
+ * This is like heap_modify_tuple, except that instead of specifying which
+ * column(s) to replace by a boolean map, an array of target column numbers
+ * is used. This is often more convenient when a fixed number of columns
+ * are to be replaced. The replCols, replValues, and replIsnull arrays must
+ * be of length nCols. Target column numbers are indexed from 1.
+ *
+ * The result is allocated in the current memory context.
+ */
+HeapTuple
+heap_modify_tuple_by_cols(HeapTuple tuple,
+ TupleDesc tupleDesc,
+ int nCols,
+ int *replCols,
+ Datum *replValues,
+ bool *replIsnull)
+{
+ int numberOfAttributes = tupleDesc->natts;
+ Datum *values;
+ bool *isnull;
+ HeapTuple newTuple;
+ int i;
+
+ /*
+ * allocate and fill values and isnull arrays from the tuple, then replace
+ * selected columns from the input arrays.
+ */
+ values = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
+ isnull = (bool *) palloc(numberOfAttributes * sizeof(bool));
+
+ heap_deform_tuple(tuple, tupleDesc, values, isnull);
+
+ for (i = 0; i < nCols; i++)
+ {
+ int attnum = replCols[i];
+
+ if (attnum <= 0 || attnum > numberOfAttributes)
+ elog(ERROR, "invalid column number %d", attnum);
+ values[attnum - 1] = replValues[i];
+ isnull[attnum - 1] = replIsnull[i];
+ }
+
+ /*
+ * create a new tuple from the values and isnull arrays
+ */
+ newTuple = heap_form_tuple(tupleDesc, values, isnull);
+
+ pfree(values);
+ pfree(isnull);
+
+ /*
+ * copy the identification info of the old tuple: t_ctid, t_self, and OID
+ * (if any)
+ */
+ newTuple->t_data->t_ctid = tuple->t_data->t_ctid;
+ newTuple->t_self = tuple->t_self;
+ newTuple->t_tableOid = tuple->t_tableOid;
+ if (tupleDesc->tdhasoid)
+ HeapTupleSetOid(newTuple, HeapTupleGetOid(tuple));
+
+ return newTuple;
+}
+
+/*
* heap_deform_tuple
* Given a tuple, extract data into values/isnull arrays; this is
* the inverse of heap_form_tuple.
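
A minimal caller sketch for heap_modify_tuple_by_cols() above (not taken from
this patch); "oldtup" and "tupdesc" are assumed to be an existing heap tuple
and its descriptor, and column 2 is assumed to be of type int4.

    int         replCols[1] = {2};
    Datum       replValues[1] = {Int32GetDatum(42)};
    bool        replIsnull[1] = {false};
    HeapTuple   newtup;

    /* replace only column 2, keeping all other attributes of oldtup */
    newtup = heap_modify_tuple_by_cols(oldtup, tupdesc,
                                       1, replCols, replValues, replIsnull);
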
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index 274a6c2e70..2846ec8b34 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -4,7 +4,7 @@
* This file contains index tuple accessor and mutator routines,
* as well as various tuple utilities.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c
new file mode 100644
index 0000000000..851c3bf4de
--- /dev/null
+++ b/src/backend/access/common/printsimple.c
@@ -0,0 +1,131 @@
+/*-------------------------------------------------------------------------
+ *
+ * printsimple.c
+ * Routines to print out tuples containing only a limited range of
+ * builtin types without catalog access. This is intended for
+ * backends that don't have catalog access because they are not bound
+ * to a specific database, such as some walsender processes. It
+ * doesn't handle standalone backends or protocol versions other than
+ * 3.0, because we don't need such handling for current applications.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/common/printsimple.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/printsimple.h"
+#include "catalog/pg_type.h"
+#include "fmgr.h"
+#include "libpq/pqformat.h"
+#include "utils/builtins.h"
+
+/*
+ * At startup time, send a RowDescription message.
+ */
+void
+printsimple_startup(DestReceiver *self, int operation, TupleDesc tupdesc)
+{
+ StringInfoData buf;
+ int i;
+
+ pq_beginmessage(&buf, 'T'); /* RowDescription */
+ pq_sendint(&buf, tupdesc->natts, 2);
+
+ for (i = 0; i < tupdesc->natts; ++i)
+ {
+ Form_pg_attribute attr = tupdesc->attrs[i];
+
+ pq_sendstring(&buf, NameStr(attr->attname));
+ pq_sendint(&buf, 0, 4); /* table oid */
+ pq_sendint(&buf, 0, 2); /* attnum */
+ pq_sendint(&buf, (int) attr->atttypid, 4);
+ pq_sendint(&buf, attr->attlen, 2);
+ pq_sendint(&buf, attr->atttypmod, 4);
+ pq_sendint(&buf, 0, 2); /* format code */
+ }
+
+ pq_endmessage(&buf);
+}
+
+/*
+ * For each tuple, send a DataRow message.
+ */
+bool
+printsimple(TupleTableSlot *slot, DestReceiver *self)
+{
+ TupleDesc tupdesc = slot->tts_tupleDescriptor;
+ StringInfoData buf;
+ int i;
+
+ /* Make sure the tuple is fully deconstructed */
+ slot_getallattrs(slot);
+
+ /* Prepare and send message */
+ pq_beginmessage(&buf, 'D');
+ pq_sendint(&buf, tupdesc->natts, 2);
+
+ for (i = 0; i < tupdesc->natts; ++i)
+ {
+ Form_pg_attribute attr = tupdesc->attrs[i];
+ Datum value;
+
+ if (slot->tts_isnull[i])
+ {
+ pq_sendint(&buf, -1, 4);
+ continue;
+ }
+
+ value = slot->tts_values[i];
+
+ /*
+ * We can't call the regular type output functions here because we
+ * might not have catalog access. Instead, we must hard-wire
+ * knowledge of the required types.
+ */
+ switch (attr->atttypid)
+ {
+ case TEXTOID:
+ {
+ text *t = DatumGetTextPP(value);
+
+ pq_sendcountedtext(&buf,
+ VARDATA_ANY(t),
+ VARSIZE_ANY_EXHDR(t),
+ false);
+ }
+ break;
+
+ case INT4OID:
+ {
+ int32 num = DatumGetInt32(value);
+ char str[12]; /* sign, 10 digits and '\0' */
+
+ pg_ltoa(num, str);
+ pq_sendcountedtext(&buf, str, strlen(str), false);
+ }
+ break;
+
+ case INT8OID:
+ {
+ int64 num = DatumGetInt64(value);
+ char str[23]; /* sign, 21 digits and '\0' */
+
+ pg_lltoa(num, str);
+ pq_sendcountedtext(&buf, str, strlen(str), false);
+ }
+ break;
+
+ default:
+ elog(ERROR, "unsupported type OID: %u", attr->atttypid);
+ }
+ }
+
+ pq_endmessage(&buf);
+
+ return true;
+}
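
A rough sketch of how a catalog-less backend might drive these routines (not
taken from this patch). It assumes "dest" is a DestReceiver wired to
printsimple_startup()/printsimple(), and that "tupdesc" is a two-column
(text, int8) descriptor built with TupleDescInitBuiltinEntry() as added later
in this diff.

    TupleTableSlot *slot;

    /* send the RowDescription once for the whole result */
    printsimple_startup(dest, CMD_SELECT, tupdesc);

    /* fill a virtual tuple and send it as a DataRow */
    slot = MakeSingleTupleTableSlot(tupdesc);
    slot->tts_values[0] = CStringGetTextDatum("some text");
    slot->tts_isnull[0] = false;
    slot->tts_values[1] = Int64GetDatum((int64) 42);
    slot->tts_isnull[1] = false;
    ExecStoreVirtualTuple(slot);

    printsimple(slot, dest);
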
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index a44be6f96f..78704dafd9 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -6,7 +6,7 @@
*
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -139,9 +139,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
*/
myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
"printtup",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3)
{
@@ -249,9 +247,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats)
atttypid = getBaseTypeAndTypmod(atttypid, &atttypmod);
pq_sendint(&buf, (int) atttypid, sizeof(atttypid));
pq_sendint(&buf, attrs[i]->attlen, sizeof(attrs[i]->attlen));
- /* typmod appears in protocol 2.0 and up */
- if (proto >= 2)
- pq_sendint(&buf, atttypmod, sizeof(atttypmod));
+ pq_sendint(&buf, atttypmod, sizeof(atttypmod));
/* format info appears in protocol 3.0 and up */
if (proto >= 3)
{
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index ba1f3aafed..6d1f22f049 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -3,7 +3,7 @@
* reloptions.c
* Core support for relation options (pg_class.reloptions)
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -15,6 +15,8 @@
#include "postgres.h"
+#include <float.h>
+
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/htup_details.h"
@@ -46,16 +48,61 @@
* (iii) add it to the appropriate options struct (perhaps StdRdOptions)
* (iv) add it to the appropriate handling routine (perhaps
* default_reloptions)
- * (v) don't forget to document the option
+ * (v) make sure the lock level is set correctly for that operation
+ * (vi) don't forget to document the option
*
* Note that we don't handle "oids" in relOpts because it is handled by
* interpretOidsOption().
+ *
+ * The default choice for any new option should be AccessExclusiveLock.
+ * In some cases the lock level can be reduced from there, but the lock
+ * level chosen should always conflict with itself to ensure that multiple
+ * changes aren't lost when we attempt concurrent changes.
+ * The choice of lock level depends completely upon how that parameter
+ * is used within the server, not upon how and when you'd like to change it.
+ * Safety first. Existing choices are documented here, and elsewhere in
+ * backend code where the parameters are used.
+ *
+ * In general, anything that affects the results obtained from a SELECT must be
+ * protected by AccessExclusiveLock.
+ *
+ * Autovacuum related parameters can be set at ShareUpdateExclusiveLock
+ * since they are only used by the AV procs and don't change anything
+ * currently executing.
+ *
+ * Fillfactor can be set at ShareUpdateExclusiveLock because it applies only to
+ * subsequent changes made to data blocks, as documented in hio.c.
+ *
+ * n_distinct options can be set at ShareUpdateExclusiveLock because they
+ * are only used during ANALYZE, which uses a ShareUpdateExclusiveLock,
+ * so the ANALYZE will not be affected by in-flight changes. Changing those
+ * values has no effect until the next ANALYZE, so no need for stronger lock.
+ *
+ * Planner-related parameters can be set with ShareUpdateExclusiveLock because
+ * they only affect planning and not the correctness of the execution. Plans
+ * cannot be changed in mid-flight, so changes here could not easily result in
+ * new improved plans in any case. So we allow existing queries to continue
+ * and existing plans to survive, a small price to pay for allowing better
+ * plans to be introduced concurrently without interfering with users.
+ *
+ * Setting parallel_workers is safe, since it acts the same as
+ * max_parallel_workers_per_gather which is a USERSET parameter that doesn't
+ * affect existing plans or queries.
*/
static relopt_bool boolRelOpts[] =
{
{
{
+ "autosummarize",
+ "Enables automatic summarization on this BRIN index",
+ RELOPT_KIND_BRIN,
+ AccessExclusiveLock
+ },
+ false
+ },
+ {
+ {
"autovacuum_enabled",
"Enables autovacuum in this relation",
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST,
@@ -265,7 +312,7 @@ static relopt_int intRelOpts[] =
"effective_io_concurrency",
"Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
RELOPT_KIND_TABLESPACE,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
#ifdef USE_PREFETCH
-1, 0, MAX_IO_CONCURRENCY
@@ -278,7 +325,7 @@ static relopt_int intRelOpts[] =
"parallel_workers",
"Number of parallel processes that can be used per executor node for this relation.",
RELOPT_KIND_HEAP,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
-1, 0, 1024
},
@@ -312,7 +359,7 @@ static relopt_real realRelOpts[] =
"seq_page_cost",
"Sets the planner's estimate of the cost of a sequentially fetched disk page.",
RELOPT_KIND_TABLESPACE,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
-1, 0.0, DBL_MAX
},
@@ -321,7 +368,7 @@ static relopt_real realRelOpts[] =
"random_page_cost",
"Sets the planner's estimate of the cost of a nonsequentially fetched disk page.",
RELOPT_KIND_TABLESPACE,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
-1, 0.0, DBL_MAX
},
@@ -330,7 +377,7 @@ static relopt_real realRelOpts[] =
"n_distinct",
"Sets the planner's estimate of the number of distinct values appearing in a column (excluding child relations).",
RELOPT_KIND_ATTRIBUTE,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
0, -1.0, DBL_MAX
},
@@ -339,7 +386,7 @@ static relopt_real realRelOpts[] =
"n_distinct_inherited",
"Sets the planner's estimate of the number of distinct values appearing in a column (including child relations).",
RELOPT_KIND_ATTRIBUTE,
- AccessExclusiveLock
+ ShareUpdateExclusiveLock
},
0, -1.0, DBL_MAX
},
@@ -722,9 +769,8 @@ transformRelOptions(Datum oldOptions, List *defList, char *namspace,
for (i = 0; i < noldoptions; i++)
{
- text *oldoption = DatumGetTextP(oldoptions[i]);
- char *text_str = VARDATA(oldoption);
- int text_len = VARSIZE(oldoption) - VARHDRSZ;
+ char *text_str = VARDATA(oldoptions[i]);
+ int text_len = VARSIZE(oldoptions[i]) - VARHDRSZ;
/* Search for a match in defList */
foreach(cell, defList)
@@ -888,7 +934,7 @@ untransformRelOptions(Datum options)
*p++ = '\0';
val = (Node *) makeString(pstrdup(p));
}
- result = lappend(result, makeDefElem(pstrdup(s), val));
+ result = lappend(result, makeDefElem(pstrdup(s), val, -1));
}
return result;
@@ -930,6 +976,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc,
case RELKIND_RELATION:
case RELKIND_TOASTVALUE:
case RELKIND_MATVIEW:
+ case RELKIND_PARTITIONED_TABLE:
options = heap_reloptions(classForm->relkind, datum, false);
break;
case RELKIND_VIEW:
@@ -962,7 +1009,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc,
* array; this is so that the caller can easily locate the default values.
*
* If there are no options of the given kind, numrelopts is set to 0 and NULL
- * is returned.
+ * is returned (unless options are illegally supplied despite none being
+ * defined, in which case an error occurs).
*
* Note: values of type int, bool and real are allocated as part of the
* returned array. Values of type string are allocated separately and must
@@ -972,7 +1020,7 @@ relopt_value *
parseRelOptions(Datum options, bool validate, relopt_kind kind,
int *numrelopts)
{
- relopt_value *reloptions;
+ relopt_value *reloptions = NULL;
int numoptions = 0;
int i;
int j;
@@ -986,21 +1034,18 @@ parseRelOptions(Datum options, bool validate, relopt_kind kind,
if (relOpts[i]->kinds & kind)
numoptions++;
- if (numoptions == 0)
+ if (numoptions > 0)
{
- *numrelopts = 0;
- return NULL;
- }
-
- reloptions = palloc(numoptions * sizeof(relopt_value));
+ reloptions = palloc(numoptions * sizeof(relopt_value));
- for (i = 0, j = 0; relOpts[i]; i++)
- {
- if (relOpts[i]->kinds & kind)
+ for (i = 0, j = 0; relOpts[i]; i++)
{
- reloptions[j].gen = relOpts[i];
- reloptions[j].isset = false;
- j++;
+ if (relOpts[i]->kinds & kind)
+ {
+ reloptions[j].gen = relOpts[i];
+ reloptions[j].isset = false;
+ j++;
+ }
}
}
@@ -1016,9 +1061,8 @@ parseRelOptions(Datum options, bool validate, relopt_kind kind,
for (i = 0; i < noptions; i++)
{
- text *optiontext = DatumGetTextP(optiondatums[i]);
- char *text_str = VARDATA(optiontext);
- int text_len = VARSIZE(optiontext) - VARHDRSZ;
+ char *text_str = VARDATA(optiondatums[i]);
+ int text_len = VARSIZE(optiondatums[i]) - VARHDRSZ;
int j;
/* Search for a match in reloptions */
@@ -1382,6 +1426,9 @@ heap_reloptions(char relkind, Datum reloptions, bool validate)
case RELKIND_RELATION:
case RELKIND_MATVIEW:
return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP);
+ case RELKIND_PARTITIONED_TABLE:
+ return default_reloptions(reloptions, validate,
+ RELOPT_KIND_PARTITIONED);
default:
/* other relkinds are not supported */
return NULL;
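
To illustrate the lock-level guidance added at the top of reloptions.c above, a
hypothetical new intRelOpts[] entry for an autovacuum-only setting might look
like this (a sketch, not part of this patch; the option name is invented):

    {
        {
            "autovacuum_foo_threshold",     /* hypothetical option name */
            "Minimum number of foo events before autovacuum acts on this relation.",
            RELOPT_KIND_HEAP | RELOPT_KIND_TOAST,
            ShareUpdateExclusiveLock        /* used only by autovacuum workers */
        },
        -1, 0, INT_MAX                      /* default, min, max */
    },
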
diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c
index 35391c2c60..13edca1f94 100644
--- a/src/backend/access/common/scankey.c
+++ b/src/backend/access/common/scankey.c
@@ -3,7 +3,7 @@
* scankey.c
* scan key support code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c
index 4787d4ca98..392a49b522 100644
--- a/src/backend/access/common/tupconvert.c
+++ b/src/backend/access/common/tupconvert.c
@@ -9,7 +9,7 @@
* executor's "junkfilter" routines, but these functions work on bare
* HeapTuples rather than TupleTableSlots.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -138,12 +138,14 @@ convert_tuples_by_position(TupleDesc indesc,
nincols, noutcols)));
/*
- * Check to see if the map is one-to-one and the tuple types are the same.
- * (We check the latter because if they're not, we want to do conversion
- * to inject the right OID into the tuple datum.)
+ * Check to see if the map is one-to-one, in which case we need not do a
+ * tuple conversion. We must also insist that both tupdescs either
+ * specify or don't specify an OID column, else we need a conversion to
+ * add/remove space for that. (For some callers, presence or absence of
+ * an OID column perhaps would not really matter, but let's be safe.)
*/
if (indesc->natts == outdesc->natts &&
- indesc->tdtypeid == outdesc->tdtypeid)
+ indesc->tdhasoid == outdesc->tdhasoid)
{
for (i = 0; i < n; i++)
{
@@ -206,63 +208,22 @@ convert_tuples_by_name(TupleDesc indesc,
{
TupleConversionMap *map;
AttrNumber *attrMap;
- int n;
+ int n = outdesc->natts;
int i;
bool same;
/* Verify compatibility and prepare attribute-number map */
- n = outdesc->natts;
- attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber));
- for (i = 0; i < n; i++)
- {
- Form_pg_attribute att = outdesc->attrs[i];
- char *attname;
- Oid atttypid;
- int32 atttypmod;
- int j;
-
- if (att->attisdropped)
- continue; /* attrMap[i] is already 0 */
- attname = NameStr(att->attname);
- atttypid = att->atttypid;
- atttypmod = att->atttypmod;
- for (j = 0; j < indesc->natts; j++)
- {
- att = indesc->attrs[j];
- if (att->attisdropped)
- continue;
- if (strcmp(attname, NameStr(att->attname)) == 0)
- {
- /* Found it, check type */
- if (atttypid != att->atttypid || atttypmod != att->atttypmod)
- ereport(ERROR,
- (errcode(ERRCODE_DATATYPE_MISMATCH),
- errmsg_internal("%s", _(msg)),
- errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.",
- attname,
- format_type_be(outdesc->tdtypeid),
- format_type_be(indesc->tdtypeid))));
- attrMap[i] = (AttrNumber) (j + 1);
- break;
- }
- }
- if (attrMap[i] == 0)
- ereport(ERROR,
- (errcode(ERRCODE_DATATYPE_MISMATCH),
- errmsg_internal("%s", _(msg)),
- errdetail("Attribute \"%s\" of type %s does not exist in type %s.",
- attname,
- format_type_be(outdesc->tdtypeid),
- format_type_be(indesc->tdtypeid))));
- }
+ attrMap = convert_tuples_by_name_map(indesc, outdesc, msg);
/*
- * Check to see if the map is one-to-one and the tuple types are the same.
- * (We check the latter because if they're not, we want to do conversion
- * to inject the right OID into the tuple datum.)
+ * Check to see if the map is one-to-one, in which case we need not do a
+ * tuple conversion. We must also insist that both tupdescs either
+ * specify or don't specify an OID column, else we need a conversion to
+ * add/remove space for that. (For some callers, presence or absence of
+ * an OID column perhaps would not really matter, but let's be safe.)
*/
if (indesc->natts == outdesc->natts &&
- indesc->tdtypeid == outdesc->tdtypeid)
+ indesc->tdhasoid == outdesc->tdhasoid)
{
same = true;
for (i = 0; i < n; i++)
@@ -313,6 +274,69 @@ convert_tuples_by_name(TupleDesc indesc,
}
/*
+ * Return a palloc'd bare attribute map for tuple conversion, matching input
+ * and output columns by name. (Dropped columns are ignored in both input and
+ * output.) This is normally a subroutine for convert_tuples_by_name, but can
+ * be used standalone.
+ */
+AttrNumber *
+convert_tuples_by_name_map(TupleDesc indesc,
+ TupleDesc outdesc,
+ const char *msg)
+{
+ AttrNumber *attrMap;
+ int n;
+ int i;
+
+ n = outdesc->natts;
+ attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber));
+ for (i = 0; i < n; i++)
+ {
+ Form_pg_attribute att = outdesc->attrs[i];
+ char *attname;
+ Oid atttypid;
+ int32 atttypmod;
+ int j;
+
+ if (att->attisdropped)
+ continue; /* attrMap[i] is already 0 */
+ attname = NameStr(att->attname);
+ atttypid = att->atttypid;
+ atttypmod = att->atttypmod;
+ for (j = 0; j < indesc->natts; j++)
+ {
+ att = indesc->attrs[j];
+ if (att->attisdropped)
+ continue;
+ if (strcmp(attname, NameStr(att->attname)) == 0)
+ {
+ /* Found it, check type */
+ if (atttypid != att->atttypid || atttypmod != att->atttypmod)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg_internal("%s", _(msg)),
+ errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.",
+ attname,
+ format_type_be(outdesc->tdtypeid),
+ format_type_be(indesc->tdtypeid))));
+ attrMap[i] = (AttrNumber) (j + 1);
+ break;
+ }
+ }
+ if (attrMap[i] == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg_internal("%s", _(msg)),
+ errdetail("Attribute \"%s\" of type %s does not exist in type %s.",
+ attname,
+ format_type_be(outdesc->tdtypeid),
+ format_type_be(indesc->tdtypeid))));
+ }
+
+ return attrMap;
+}
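
A standalone usage sketch for the new map function (not from this patch):
"parentdesc" and "childdesc" are assumed tuple descriptors; entry i of the
result is the parent attribute number matching child column i+1, or 0 for a
dropped column.

    AttrNumber *map;
    int         i;

    map = convert_tuples_by_name_map(parentdesc,   /* indesc */
                                     childdesc,    /* outdesc */
                                     gettext_noop("could not convert row type"));

    for (i = 0; i < childdesc->natts; i++)
        elog(DEBUG1, "child column %d maps to parent column %d",
             i + 1, (int) map[i]);
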
+
+/*
* Perform conversion of a tuple according to the map.
*/
HeapTuple
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index b56d0e336f..9fd7b4e019 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -3,7 +3,7 @@
* tupdesc.c
* POSTGRES tuple descriptor support code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -20,6 +20,7 @@
#include "postgres.h"
#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "parser/parse_type.h"
@@ -149,6 +150,7 @@ CreateTupleDescCopy(TupleDesc tupdesc)
memcpy(desc->attrs[i], tupdesc->attrs[i], ATTRIBUTE_FIXED_PART_SIZE);
desc->attrs[i]->attnotnull = false;
desc->attrs[i]->atthasdef = false;
+ desc->attrs[i]->attidentity = '\0';
}
desc->tdtypeid = tupdesc->tdtypeid;
@@ -256,6 +258,7 @@ TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno,
/* since we're not copying constraints or defaults, clear these */
dst->attrs[dstAttno - 1]->attnotnull = false;
dst->attrs[dstAttno - 1]->atthasdef = false;
+ dst->attrs[dstAttno - 1]->attidentity = '\0';
}
/*
@@ -400,6 +403,8 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
return false;
if (attr1->atthasdef != attr2->atthasdef)
return false;
+ if (attr1->attidentity != attr2->attidentity)
+ return false;
if (attr1->attisdropped != attr2->attisdropped)
return false;
if (attr1->attislocal != attr2->attislocal)
@@ -533,6 +538,7 @@ TupleDescInitEntry(TupleDesc desc,
att->attnotnull = false;
att->atthasdef = false;
+ att->attidentity = '\0';
att->attisdropped = false;
att->attislocal = true;
att->attinhcount = 0;
@@ -554,6 +560,93 @@ TupleDescInitEntry(TupleDesc desc,
}
/*
+ * TupleDescInitBuiltinEntry
+ * Initialize a tuple descriptor without catalog access. Only
+ * a limited range of builtin types are supported.
+ */
+void
+TupleDescInitBuiltinEntry(TupleDesc desc,
+ AttrNumber attributeNumber,
+ const char *attributeName,
+ Oid oidtypeid,
+ int32 typmod,
+ int attdim)
+{
+ Form_pg_attribute att;
+
+ /* sanity checks */
+ AssertArg(PointerIsValid(desc));
+ AssertArg(attributeNumber >= 1);
+ AssertArg(attributeNumber <= desc->natts);
+
+ /* initialize the attribute fields */
+ att = desc->attrs[attributeNumber - 1];
+ att->attrelid = 0; /* dummy value */
+
+ /* unlike TupleDescInitEntry, we require an attribute name */
+ Assert(attributeName != NULL);
+ namestrcpy(&(att->attname), attributeName);
+
+ att->attstattarget = -1;
+ att->attcacheoff = -1;
+ att->atttypmod = typmod;
+
+ att->attnum = attributeNumber;
+ att->attndims = attdim;
+
+ att->attnotnull = false;
+ att->atthasdef = false;
+ att->attidentity = '\0';
+ att->attisdropped = false;
+ att->attislocal = true;
+ att->attinhcount = 0;
+ /* attacl, attoptions and attfdwoptions are not present in tupledescs */
+
+ att->atttypid = oidtypeid;
+
+ /*
+ * Our goal here is to support just enough types to let basic builtin
+ * commands work without catalog access - e.g. so that we can do certain
+ * things even in processes that are not connected to a database.
+ */
+ switch (oidtypeid)
+ {
+ case TEXTOID:
+ case TEXTARRAYOID:
+ att->attlen = -1;
+ att->attbyval = false;
+ att->attalign = 'i';
+ att->attstorage = 'x';
+ att->attcollation = DEFAULT_COLLATION_OID;
+ break;
+
+ case BOOLOID:
+ att->attlen = 1;
+ att->attbyval = true;
+ att->attalign = 'c';
+ att->attstorage = 'p';
+ att->attcollation = InvalidOid;
+ break;
+
+ case INT4OID:
+ att->attlen = 4;
+ att->attbyval = true;
+ att->attalign = 'i';
+ att->attstorage = 'p';
+ att->attcollation = InvalidOid;
+ break;
+
+ case INT8OID:
+ att->attlen = 8;
+ att->attbyval = FLOAT8PASSBYVAL;
+ att->attalign = 'd';
+ att->attstorage = 'p';
+ att->attcollation = InvalidOid;
+ break;
+ }
+}
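
A brief sketch of building a descriptor without catalog access using the new
function (not part of this patch; the column names are arbitrary):

    TupleDesc   tupdesc = CreateTemplateTupleDesc(2, false);

    TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "name", TEXTOID, -1, 0);
    TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "size", INT8OID, -1, 0);
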
+
+/*
* TupleDescInitEntryCollation
*
* Assign a nondefault collation to a previously initialized tuple descriptor
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index fade0cbb61..990b5ffa58 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -314,10 +314,17 @@ deleted.
The previous paragraph's reasoning only applies to searches, and only to
posting trees. To protect from inserters following a downlink to a deleted
page, vacuum simply locks out all concurrent insertions to the posting tree,
-by holding a super-exclusive lock on the posting tree root. Inserters hold a
-pin on the root page, but searches do not, so while new searches cannot begin
-while root page is locked, any already-in-progress scans can continue
-concurrently with vacuum. In the entry tree, we never delete pages.
+by holding a super-exclusive lock on the parent page of the subtree containing
+the deletable pages. Inserters hold a pin on the root page, but searches do
+not, so while new searches cannot begin while the root page is locked, any
+already-in-progress scans can continue concurrently with vacuum in the
+corresponding subtree of the posting tree. To avoid interfering with readers,
+vacuum takes exclusive locks in a depth-first scan, visiting downlinks in
+left-to-right order; the leftmost page is never deleted. Thus, before deleting
+any page we have already locked every page to its left, ruling out deadlock
+with any reader even though we lock the parent before the current page and
+the left sibling after it. We take the left sibling's lock only because we
+need to mark that page dirty. In the entry tree, we never delete pages.
(This is quite different from the mechanism the btree indexam uses to make
page-deletions safe; it stamps the deleted pages with an XID and keeps the
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
index aaf72a3f9e..cc7435e030 100644
--- a/src/backend/access/gin/ginarrayproc.c
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -4,7 +4,7 @@
* support functions for GIN's indexing of any array
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index fa383719e6..b02cb8ae58 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -4,7 +4,7 @@
* page utilities routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/memutils.h"
@@ -30,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
/*
* Lock buffer by needed method for search.
*/
-static int
+int
ginTraverseLock(Buffer buffer, bool searchMode)
{
Page page;
@@ -348,9 +349,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
*/
tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
"ginPlaceToPage temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldCxt = MemoryContextSwitchTo(tmpCxt);
if (GinPageIsData(page))
diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c
index d6422ea91e..f07c76b90b 100644
--- a/src/backend/access/gin/ginbulk.c
+++ b/src/backend/access/gin/ginbulk.c
@@ -4,7 +4,7 @@
* routines for fast build of inverted index
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -255,7 +255,7 @@ qsortCompareItemPointers(const void *a, const void *b)
void
ginBeginBAScan(BuildAccumulator *accum)
{
- rb_begin_iterate(accum->tree, LeftRightWalk);
+ rb_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk);
}
/*
@@ -271,7 +271,7 @@ ginGetBAEntry(BuildAccumulator *accum,
GinEntryAccumulator *entry;
ItemPointerData *list;
- entry = (GinEntryAccumulator *) rb_iterate(accum->tree);
+ entry = (GinEntryAccumulator *) rb_iterate(&accum->tree_walk);
if (entry == NULL)
return NULL; /* no more entries */
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 97c8bf78e7..ad62d4e0e9 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -4,7 +4,7 @@
* routines for handling GIN posting tree pages.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "lib/ilist.h"
#include "miscadmin.h"
@@ -86,7 +87,7 @@ typedef struct
char action;
ItemPointerData *modifieditems;
- int nmodifieditems;
+ uint16 nmodifieditems;
/*
* The following fields represent the items in this segment. If 'items' is
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 8c0bfe9fde..8c9859ce8e 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -4,7 +4,7 @@
* routines for handling GIN entry tree pages.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index 59a63f28d0..0d5bb70cc9 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -7,7 +7,7 @@
* transfer pending entries into the regular index structure. This
* wins because bulk insertion is much more efficient than retail.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -19,6 +19,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
@@ -30,6 +31,7 @@
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "utils/builtins.h"
/* GUC parameter */
int gin_pending_list_limit = 0;
@@ -808,9 +810,7 @@ ginInsertCleanup(GinState *ginstate, bool full_clean,
*/
opCtx = AllocSetContextCreate(CurrentMemoryContext,
"GIN insert cleanup temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldCtx = MemoryContextSwitchTo(opCtx);
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 9ed9fd2dc5..610d386ff8 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -4,7 +4,7 @@
* fetch tuples from a GIN scan.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -120,7 +120,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
Form_pg_attribute attr;
/* Initialize empty bitmap result */
- scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
+ scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
/* Null query cannot partial-match anything */
if (scanEntry->isPartialMatch &&
@@ -626,8 +626,9 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
}
else
{
- entry->btree.itemptr = advancePast;
- entry->btree.itemptr.ip_posid++;
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
}
entry->btree.fullScan = false;
stack = ginFindLeafPage(&entry->btree, true, snapshot);
@@ -979,15 +980,17 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
if (GinItemPointerGetBlockNumber(&advancePast) <
GinItemPointerGetBlockNumber(&minItem))
{
- advancePast.ip_blkid = minItem.ip_blkid;
- advancePast.ip_posid = 0;
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ InvalidOffsetNumber);
}
}
else
{
- Assert(minItem.ip_posid > 0);
- advancePast = minItem;
- advancePast.ip_posid--;
+ Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
}
/*
@@ -1245,15 +1248,17 @@ scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
if (GinItemPointerGetBlockNumber(&advancePast) <
GinItemPointerGetBlockNumber(&key->curItem))
{
- advancePast.ip_blkid = key->curItem.ip_blkid;
- advancePast.ip_posid = 0;
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ InvalidOffsetNumber);
}
}
else
{
- Assert(key->curItem.ip_posid > 0);
- advancePast = key->curItem;
- advancePast.ip_posid--;
+ Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
}
/*
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 9f784bf48d..d90faae65d 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -4,7 +4,7 @@
* insert routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
@@ -372,9 +373,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
*/
buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
"Gin build temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
/*
* create a temporary memory context that is used for calling
@@ -382,9 +381,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
*/
buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
"Gin build temporary context for user-defined function",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
buildstate.accum.ginstate = &buildstate.ginstate;
ginInitBA(&buildstate.accum);
@@ -486,41 +483,48 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
bool
gininsert(Relation index, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
- GinState ginstate;
+ GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
MemoryContext oldCtx;
MemoryContext insertCtx;
int i;
+ /* Initialize GinState cache if first call in this statement */
+ if (ginstate == NULL)
+ {
+ oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
+ ginstate = (GinState *) palloc(sizeof(GinState));
+ initGinState(ginstate, index);
+ indexInfo->ii_AmCache = (void *) ginstate;
+ MemoryContextSwitchTo(oldCtx);
+ }
+
insertCtx = AllocSetContextCreate(CurrentMemoryContext,
"Gin insert temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldCtx = MemoryContextSwitchTo(insertCtx);
- initGinState(&ginstate, index);
-
if (GinGetUseFastUpdate(index))
{
GinTupleCollector collector;
memset(&collector, 0, sizeof(GinTupleCollector));
- for (i = 0; i < ginstate.origTupdesc->natts; i++)
- ginHeapTupleFastCollect(&ginstate, &collector,
+ for (i = 0; i < ginstate->origTupdesc->natts; i++)
+ ginHeapTupleFastCollect(ginstate, &collector,
(OffsetNumber) (i + 1),
values[i], isnull[i],
ht_ctid);
- ginHeapTupleFastInsert(&ginstate, &collector);
+ ginHeapTupleFastInsert(ginstate, &collector);
}
else
{
- for (i = 0; i < ginstate.origTupdesc->natts; i++)
- ginHeapTupleInsert(&ginstate, (OffsetNumber) (i + 1),
+ for (i = 0; i < ginstate->origTupdesc->natts; i++)
+ ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
values[i], isnull[i],
ht_ctid);
}
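
The ii_AmCache idiom introduced here for gininsert() (and for gistinsert()
later in this diff) generalizes to any aminsert implementation. A schematic
sketch, where FooState and initFooState() are hypothetical stand-ins for an
AM's per-statement working state:

    bool
    fooinsert(Relation index, Datum *values, bool *isnull,
              ItemPointer ht_ctid, Relation heapRel,
              IndexUniqueCheck checkUnique, IndexInfo *indexInfo)
    {
        FooState   *state = (FooState *) indexInfo->ii_AmCache;

        /*
         * Build the per-statement cache on first call, in ii_Context so that
         * it survives until the IndexInfo itself is released.
         */
        if (state == NULL)
        {
            MemoryContext oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);

            state = (FooState *) palloc(sizeof(FooState));
            initFooState(state, index);
            indexInfo->ii_AmCache = (void *) state;
            MemoryContextSwitchTo(oldCtx);
        }

        /* ... insert using the cached state ... */
        return false;
    }
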
diff --git a/src/backend/access/gin/ginlogic.c b/src/backend/access/gin/ginlogic.c
index d3e84eee97..a940a9374a 100644
--- a/src/backend/access/gin/ginlogic.c
+++ b/src/backend/access/gin/ginlogic.c
@@ -24,7 +24,7 @@
* is used for.)
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
index 54d5f6f630..8d2d31ac72 100644
--- a/src/backend/access/gin/ginpostinglist.c
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -4,7 +4,7 @@
* routines for dealing with posting lists.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -79,13 +79,11 @@ itemptr_to_uint64(const ItemPointer iptr)
uint64 val;
Assert(ItemPointerIsValid(iptr));
- Assert(iptr->ip_posid < (1 << MaxHeapTuplesPerPageBits));
+ Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
- val = iptr->ip_blkid.bi_hi;
- val <<= 16;
- val |= iptr->ip_blkid.bi_lo;
+ val = GinItemPointerGetBlockNumber(iptr);
val <<= MaxHeapTuplesPerPageBits;
- val |= iptr->ip_posid;
+ val |= GinItemPointerGetOffsetNumber(iptr);
return val;
}
@@ -93,11 +91,9 @@ itemptr_to_uint64(const ItemPointer iptr)
static inline void
uint64_to_itemptr(uint64 val, ItemPointer iptr)
{
- iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1);
+ GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
val = val >> MaxHeapTuplesPerPageBits;
- iptr->ip_blkid.bi_lo = val & 0xFFFF;
- val = val >> 16;
- iptr->ip_blkid.bi_hi = val & 0xFFFF;
+ GinItemPointerSetBlockNumber(iptr, val);
Assert(ItemPointerIsValid(iptr));
}
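
A worked example of the round trip (not from this patch), assuming
MaxHeapTuplesPerPageBits is 11: block 3, offset 5 packs to
(3 << 11) | 5 = 6149 and unpacks back to (3, 5).

    ItemPointerData iptr;
    uint64      val;

    ItemPointerSet(&iptr, 3, 5);
    val = itemptr_to_uint64(&iptr);     /* 6149 under the assumption above */
    uint64_to_itemptr(val, &iptr);      /* restores block 3, offset 5 */
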
diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c
index c449c1cbc0..c83375d6b4 100644
--- a/src/backend/access/gin/ginscan.c
+++ b/src/backend/access/gin/ginscan.c
@@ -4,7 +4,7 @@
* routines to manage scans of inverted index relations
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -38,14 +38,10 @@ ginbeginscan(Relation rel, int nkeys, int norderbys)
so->nkeys = 0;
so->tempCtx = AllocSetContextCreate(CurrentMemoryContext,
"Gin scan temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
so->keyCtx = AllocSetContextCreate(CurrentMemoryContext,
"Gin scan key context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
initGinState(&so->ginstate, scan->indexRelation);
scan->opaque = so;
@@ -151,7 +147,7 @@ ginFillScanKey(GinScanOpaque so, OffsetNumber attnum,
key->nuserentries = nUserQueryValues;
key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * nQueryValues);
- key->entryRes = (bool *) palloc0(sizeof(bool) * nQueryValues);
+ key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) * nQueryValues);
key->query = query;
key->queryValues = queryValues;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index d9146488c4..d03d59da6a 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -4,7 +4,7 @@
* Utility routines for the Postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/reloptions.h"
#include "access/xloginsert.h"
#include "catalog/pg_collation.h"
@@ -22,7 +23,9 @@
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
+#include "utils/typcache.h"
/*
@@ -47,6 +50,7 @@ ginhandler(PG_FUNCTION_ARGS)
amroutine->amstorage = true;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
+ amroutine->amcanparallel = false;
amroutine->amkeytype = InvalidOid;
amroutine->ambuild = ginbuild;
@@ -66,6 +70,9 @@ ginhandler(PG_FUNCTION_ARGS)
amroutine->amendscan = ginendscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
@@ -105,9 +112,33 @@ initGinState(GinState *state, Relation index)
origTupdesc->attrs[i]->attcollation);
}
- fmgr_info_copy(&(state->compareFn[i]),
- index_getprocinfo(index, i + 1, GIN_COMPARE_PROC),
- CurrentMemoryContext);
+ /*
+ * If the compare proc isn't specified in the opclass definition, look
+ * up the index key type's default btree comparator.
+ */
+ if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid)
+ {
+ fmgr_info_copy(&(state->compareFn[i]),
+ index_getprocinfo(index, i + 1, GIN_COMPARE_PROC),
+ CurrentMemoryContext);
+ }
+ else
+ {
+ TypeCacheEntry *typentry;
+
+ typentry = lookup_type_cache(origTupdesc->attrs[i]->atttypid,
+ TYPECACHE_CMP_PROC_FINFO);
+ if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_FUNCTION),
+ errmsg("could not identify a comparison function for type %s",
+ format_type_be(origTupdesc->attrs[i]->atttypid))));
+ fmgr_info_copy(&(state->compareFn[i]),
+ &(typentry->cmp_proc_finfo),
+ CurrentMemoryContext);
+ }
+
+ /* Opclass must always provide extract procs */
fmgr_info_copy(&(state->extractValueFn[i]),
index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC),
CurrentMemoryContext);
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index c258478f23..27e502a360 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -4,7 +4,7 @@
* delete & vacuum routines for the postgres GIN
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,6 +15,7 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xloginsert.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
@@ -108,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer)
PageSetLSN(page, recptr);
}
-static bool
-ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer)
-{
- Buffer buffer;
- Page page;
- bool hasVoidPage = FALSE;
- MemoryContext oldCxt;
-
- buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
- RBM_NORMAL, gvs->strategy);
- page = BufferGetPage(buffer);
-
- /*
- * We should be sure that we don't concurrent with inserts, insert process
- * never release root page until end (but it can unlock it and lock
- * again). New scan can't start but previously started ones work
- * concurrently.
- */
- if (isRoot)
- LockBufferForCleanup(buffer);
- else
- LockBuffer(buffer, GIN_EXCLUSIVE);
-
- Assert(GinPageIsData(page));
- if (GinPageIsLeaf(page))
- {
- oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
- ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
- MemoryContextSwitchTo(oldCxt);
- MemoryContextReset(gvs->tmpCxt);
-
- /* if root is a leaf page, we don't desire further processing */
- if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
- hasVoidPage = TRUE;
- }
- else
- {
- OffsetNumber i;
- bool isChildHasVoid = FALSE;
-
- for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
- {
- PostingItem *pitem = GinDataPageGetPostingItem(page, i);
-
- if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
- isChildHasVoid = TRUE;
- }
-
- if (isChildHasVoid)
- hasVoidPage = TRUE;
- }
+typedef struct DataPageDeleteStack
+{
+ struct DataPageDeleteStack *child;
+ struct DataPageDeleteStack *parent;
- /*
- * if we have root and there are empty pages in tree, then we don't
- * release lock to go further processing and guarantee that tree is unused
- */
- if (!(isRoot && hasVoidPage))
- {
- UnlockReleaseBuffer(buffer);
- }
- else
- {
- Assert(rootBuffer);
- *rootBuffer = buffer;
- }
+ BlockNumber blkno; /* current block number */
+	BlockNumber leftBlkno;		/* rightmost non-deleted page to the left */
+ bool isRoot;
+} DataPageDeleteStack;
- return hasVoidPage;
-}
/*
* Delete a posting tree page.
@@ -193,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
BlockNumber rightlink;
/*
- * Lock the pages in the same order as an insertion would, to avoid
- * deadlocks: left, then right, then parent.
+	 * This function MUST be called only while one of the parent pages is
+	 * held with an exclusive cleanup lock; that guarantees that no insertions
+	 * are currently happening in this subtree. The caller has also taken an
+	 * exclusive lock on the deletable page itself, and earlier in the scan it
+	 * locked and released the left page, so the locking order was: left page,
+	 * then parent, then this page. We re-acquire the left page's lock here
+	 * only so that we can mark it dirty after changing its right pointer.
*/
lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
RBM_NORMAL, gvs->strategy);
@@ -204,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
RBM_NORMAL, gvs->strategy);
LockBuffer(lBuffer, GIN_EXCLUSIVE);
- LockBuffer(dBuffer, GIN_EXCLUSIVE);
- if (!isParentRoot) /* parent is already locked by
- * LockBufferForCleanup() */
- LockBuffer(pBuffer, GIN_EXCLUSIVE);
START_CRIT_SECTION();
@@ -271,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
PageSetLSN(BufferGetPage(lBuffer), recptr);
}
- if (!isParentRoot)
- LockBuffer(pBuffer, GIN_UNLOCK);
ReleaseBuffer(pBuffer);
UnlockReleaseBuffer(lBuffer);
- UnlockReleaseBuffer(dBuffer);
+ ReleaseBuffer(dBuffer);
END_CRIT_SECTION();
gvs->result->pages_deleted++;
}
-typedef struct DataPageDeleteStack
-{
- struct DataPageDeleteStack *child;
- struct DataPageDeleteStack *parent;
-
- BlockNumber blkno; /* current block number */
- BlockNumber leftBlkno; /* rightest non-deleted page on left */
- bool isRoot;
-} DataPageDeleteStack;
/*
* scans posting tree and deletes empty pages
@@ -324,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
+
+ if (!isRoot)
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+
page = BufferGetPage(buffer);
Assert(GinPageIsData(page));
@@ -358,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
}
}
+ if (!isRoot)
+ LockBuffer(buffer, GIN_UNLOCK);
+
ReleaseBuffer(buffer);
if (!meDelete)
@@ -366,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
return meDelete;
}
-static void
-ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+
+/*
+ * Scan through the posting tree and delete empty tuples from leaf pages.
+ * This function also collects empty subtrees (those whose leaves are all
+ * empty): for the parent of each such subtree a cleanup lock is taken and
+ * ginScanToDelete() is called. This is done for every inner page that
+ * points to an empty subtree.
+ */
+static bool
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
{
- Buffer rootBuffer = InvalidBuffer;
- DataPageDeleteStack root,
- *ptr,
- *tmp;
+ Buffer buffer;
+ Page page;
+ bool hasVoidPage = FALSE;
+ MemoryContext oldCxt;
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, gvs->strategy);
+ page = BufferGetPage(buffer);
+
+ ginTraverseLock(buffer, false);
+
+ Assert(GinPageIsData(page));
- if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE)
+ if (GinPageIsLeaf(page))
{
- Assert(rootBuffer == InvalidBuffer);
- return;
+ oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+ ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(gvs->tmpCxt);
+
+		/* if the leaf page is now empty, report that to our caller */
+ if (GinDataLeafPageIsEmpty(page))
+ hasVoidPage = TRUE;
+
+ UnlockReleaseBuffer(buffer);
+
+ return hasVoidPage;
}
+ else
+ {
+ OffsetNumber i;
+ bool hasEmptyChild = FALSE;
+ bool hasNonEmptyChild = FALSE;
+ OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
+ BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
+
+ /*
+		 * Read the block numbers of all children while we hold the lock.
+		 * (It's not clear this is safe if there are many concurrent vacuums.)
+ */
- memset(&root, 0, sizeof(DataPageDeleteStack));
- root.leftBlkno = InvalidBlockNumber;
- root.isRoot = TRUE;
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ PostingItem *pitem = GinDataPageGetPostingItem(page, i);
- vacuum_delay_point();
+ children[i] = PostingItemGetBlockNumber(pitem);
+ }
- ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);
+ UnlockReleaseBuffer(buffer);
- ptr = root.child;
- while (ptr)
- {
- tmp = ptr->child;
- pfree(ptr);
- ptr = tmp;
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
+ hasEmptyChild = TRUE;
+ else
+ hasNonEmptyChild = TRUE;
+ }
+
+ pfree(children);
+
+ vacuum_delay_point();
+
+ /*
+		 * If the entire subtree is empty, just return TRUE so that our
+		 * parent does the cleanup -- unless we are the root, in which case
+		 * there is no parent to pass the job to.
+ */
+
+ if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
+ return TRUE;
+
+ if (hasEmptyChild)
+ {
+ DataPageDeleteStack root,
+ *ptr,
+ *tmp;
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, gvs->strategy);
+ LockBufferForCleanup(buffer);
+
+ memset(&root, 0, sizeof(DataPageDeleteStack));
+ root.leftBlkno = InvalidBlockNumber;
+ root.isRoot = TRUE;
+
+ ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
+
+ ptr = root.child;
+
+ while (ptr)
+ {
+ tmp = ptr->child;
+ pfree(ptr);
+ ptr = tmp;
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* Here we have deleted all empty subtrees */
+ return FALSE;
}
+}
- UnlockReleaseBuffer(rootBuffer);
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+ ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
}
/*
@@ -526,9 +553,7 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
"Gin vacuum temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
gvs.index = index;
gvs.callback = callback;
gvs.callback_state = callback_state;
diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c
index 032508387d..0d2847456e 100644
--- a/src/backend/access/gin/ginvalidate.c
+++ b/src/backend/access/gin/ginvalidate.c
@@ -3,7 +3,7 @@
* ginvalidate.c
* Opclass validator for GIN.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -24,6 +24,7 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
+#include "utils/regproc.h"
/*
@@ -237,7 +238,7 @@ ginvalidate(Oid opclassoid)
if (opclassgroup &&
(opclassgroup->functionset & (((uint64) 1) << i)) != 0)
continue; /* got it */
- if (i == GIN_COMPARE_PARTIAL_PROC)
+ if (i == GIN_COMPARE_PROC || i == GIN_COMPARE_PARTIAL_PROC)
continue; /* optional method */
if (i == GIN_CONSISTENT_PROC || i == GIN_TRICONSISTENT_PROC)
continue; /* don't need both, see check below loop */
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index b4d310f337..7ba04e324f 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -4,7 +4,7 @@
* WAL replay logic for inverted index.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -13,7 +13,9 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
@@ -749,13 +751,43 @@ gin_xlog_startup(void)
{
opCtx = AllocSetContextCreate(CurrentMemoryContext,
"GIN recovery temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
}
void
gin_xlog_cleanup(void)
{
MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
+
+/*
+ * Mask a GIN page before running consistency checks on it.
+ */
+void
+gin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ GinPageOpaque opaque;
+
+ mask_page_lsn(page);
+ opaque = GinPageGetOpaque(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+	 * The GIN metapage doesn't use pd_lower/pd_upper, but the other page
+	 * types do, so the masking below is applied only to non-meta pages.
+ */
+ if (opaque->flags != GIN_META)
+ {
+ /*
+		 * A GIN_DELETED page is initialized to empty, so mask its entire
+		 * content.
+ */
+ if (opaque->flags & GIN_DELETED)
+ mask_page_content(page);
+ else
+ mask_unused_space(page);
+ }
}
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index dd4c9fa70a..02228662b8 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -28,7 +28,7 @@ The current implementation of GiST supports:
The support for concurrency implemented in PostgreSQL was developed based on
the paper "Access Methods for Next-Generation Database Systems" by
-Marcel Kornaker:
+Marcel Kornacker:
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 9a417ca2f4..6593771361 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -4,7 +4,7 @@
* interface routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -18,6 +18,8 @@
#include "access/gistscan.h"
#include "catalog/pg_collation.h"
#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -69,6 +71,7 @@ gisthandler(PG_FUNCTION_ARGS)
amroutine->amstorage = true;
amroutine->amclusterable = true;
amroutine->ampredlocks = false;
+ amroutine->amcanparallel = false;
amroutine->amkeytype = InvalidOid;
amroutine->ambuild = gistbuild;
@@ -88,6 +91,9 @@ gisthandler(PG_FUNCTION_ARGS)
amroutine->amendscan = gistendscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
@@ -105,9 +111,7 @@ createTempGistContext(void)
{
return AllocSetContextCreate(CurrentMemoryContext,
"GiST temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
}
/*
@@ -142,21 +146,23 @@ gistbuildempty(Relation index)
bool
gistinsert(Relation r, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
+ GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache;
IndexTuple itup;
- GISTSTATE *giststate;
MemoryContext oldCxt;
- giststate = initGISTstate(r);
+ /* Initialize GISTSTATE cache if first call in this statement */
+ if (giststate == NULL)
+ {
+ oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+ giststate = initGISTstate(r);
+ giststate->tempCxt = createTempGistContext();
+ indexInfo->ii_AmCache = (void *) giststate;
+ MemoryContextSwitchTo(oldCxt);
+ }
- /*
- * We use the giststate's scan context as temp context too. This means
- * that any memory leaked by the support functions is not reclaimed until
- * end of insert. In most cases, we aren't going to call the support
- * functions very many times before finishing the insert, so this seems
- * cheaper than resetting a temp context for each function call.
- */
oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
itup = gistFormTuple(giststate, r,
@@ -167,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
/* cleanup */
MemoryContextSwitchTo(oldCxt);
- freeGISTstate(giststate);
+ MemoryContextReset(giststate->tempCxt);
return false;
}
@@ -495,18 +501,36 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
else
{
/*
- * Enough space. We also get here if ntuples==0.
+ * Enough space. We always get here if ntup==0.
*/
START_CRIT_SECTION();
/*
- * While we delete only one tuple at once we could mix calls
- * PageIndexTupleDelete() here and PageIndexMultiDelete() in
- * gistRedoPageUpdateRecord()
+ * Delete old tuple if any, then insert new tuple(s) if any. If
+ * possible, use the fast path of PageIndexTupleOverwrite.
*/
if (OffsetNumberIsValid(oldoffnum))
- PageIndexTupleDelete(page, oldoffnum);
- gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ {
+ if (ntup == 1)
+ {
+ /* One-for-one replacement, so use PageIndexTupleOverwrite */
+ if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup,
+ IndexTupleSize(*itup)))
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ else
+ {
+ /* Delete old, then append new tuple(s) to page */
+ PageIndexTupleDelete(page, oldoffnum);
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ /* Just append new tuples at the end of the page */
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
MarkBufferDirty(buffer);
@@ -1411,9 +1435,7 @@ initGISTstate(Relation index)
/* Create the memory context that will hold the GISTSTATE */
scanCxt = AllocSetContextCreate(CurrentMemoryContext,
"GiST scan context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
oldCxt = MemoryContextSwitchTo(scanCxt);
/* Create and fill in the GISTSTATE */
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 4e43a6932a..f1f08bb3d8 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -4,7 +4,7 @@
* build algorithm for GiST indexes implementation.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -18,6 +18,7 @@
#include "access/genam.h"
#include "access/gist_private.h"
+#include "access/gistxlog.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
index 8e3fcfbdc1..ca4c32b3fe 100644
--- a/src/backend/access/gist/gistbuildbuffers.c
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -4,7 +4,7 @@
* node buffer management functions for GiST buffering build algorithm.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 5ba7d0a793..5a4dea89ac 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -4,7 +4,7 @@
* fetch tuples from a GiST scan.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -375,6 +375,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
}
so->nPageData = so->curPageData = 0;
+ scan->xs_hitup = NULL; /* might point into pageDataCxt */
if (so->pageDataCxt)
MemoryContextReset(so->pageDataCxt);
@@ -441,12 +442,13 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
so->pageData[so->nPageData].offnum = i;
/*
- * In an index-only scan, also fetch the data from the tuple.
+ * In an index-only scan, also fetch the data from the tuple. The
+ * reconstructed tuples are stored in pageDataCxt.
*/
if (scan->xs_want_itup)
{
oldcxt = MemoryContextSwitchTo(so->pageDataCxt);
- so->pageData[so->nPageData].ftup =
+ so->pageData[so->nPageData].recontup =
gistFetchTuple(giststate, r, it);
MemoryContextSwitchTo(oldcxt);
}
@@ -478,7 +480,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
* In an index-only scan, also fetch the data from the tuple.
*/
if (scan->xs_want_itup)
- item->data.heap.ftup = gistFetchTuple(giststate, r, it);
+ item->data.heap.recontup = gistFetchTuple(giststate, r, it);
}
else
{
@@ -540,11 +542,11 @@ getNextNearest(IndexScanDesc scan)
bool res = false;
int i;
- if (scan->xs_itup)
+ if (scan->xs_hitup)
{
/* free previously returned tuple */
- pfree(scan->xs_itup);
- scan->xs_itup = NULL;
+ pfree(scan->xs_hitup);
+ scan->xs_hitup = NULL;
}
do
@@ -601,7 +603,7 @@ getNextNearest(IndexScanDesc scan)
/* in an index-only scan, also return the reconstructed tuple. */
if (scan->xs_want_itup)
- scan->xs_itup = item->data.heap.ftup;
+ scan->xs_hitup = item->data.heap.recontup;
res = true;
}
else
@@ -641,6 +643,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
so->firstCall = false;
so->curPageData = so->nPageData = 0;
+ scan->xs_hitup = NULL;
if (so->pageDataCxt)
MemoryContextReset(so->pageDataCxt);
@@ -685,7 +688,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
/* in an index-only scan, also return the reconstructed tuple */
if (scan->xs_want_itup)
- scan->xs_itup = so->pageData[so->curPageData].ftup;
+ scan->xs_hitup = so->pageData[so->curPageData].recontup;
so->curPageData++;
@@ -765,6 +768,7 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
/* Begin the scan by processing the root page */
so->curPageData = so->nPageData = 0;
+ scan->xs_hitup = NULL;
if (so->pageDataCxt)
MemoryContextReset(so->pageDataCxt);
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
index d47211afc0..15b89fd8ad 100644
--- a/src/backend/access/gist/gistproc.c
+++ b/src/backend/access/gist/gistproc.c
@@ -7,7 +7,7 @@
* This gives R-tree behavior, with Guttman's poly-time split algorithm.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -17,6 +17,7 @@
*/
#include "postgres.h"
+#include <float.h>
#include <math.h>
#include "access/gist.h"
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 6f07cd8d46..058544e2ae 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -4,7 +4,7 @@
* routines to manage scans on GiST index relations
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -125,7 +125,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
* which is created on the second call and reset on later calls. Thus, in
* the common case where a scan is only rescan'd once, we just put the
* queue in scanCxt and don't pay the overhead of making a second memory
- * context. If we do rescan more than once, the first RBTree is just left
+ * context. If we do rescan more than once, the first queue is just left
* for dead until end of scan; this small wastage seems worth the savings
* in the common case.
*/
@@ -140,9 +140,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
/* second time through */
so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
"GiST queue context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
first_time = false;
}
else
@@ -157,7 +155,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
* tuple descriptor to represent the returned index tuples and create a
* memory context to hold them during the scan.
*/
- if (scan->xs_want_itup && !scan->xs_itupdesc)
+ if (scan->xs_want_itup && !scan->xs_hitupdesc)
{
int natts;
int attno;
@@ -176,16 +174,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
scan->indexRelation->rd_opcintype[attno - 1],
-1, 0);
}
- scan->xs_itupdesc = so->giststate->fetchTupdesc;
+ scan->xs_hitupdesc = so->giststate->fetchTupdesc;
+ /* Also create a memory context that will hold the returned tuples */
so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt,
"GiST page data context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
}
- /* create new, empty RBTree for search queue */
+ /* create new, empty pairing heap for search queue */
oldCxt = MemoryContextSwitchTo(so->queueCxt);
so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan);
MemoryContextSwitchTo(oldCxt);
@@ -316,6 +313,9 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
if (!first_time)
pfree(fn_extras);
}
+
+ /* any previous xs_hitup will have been pfree'd in context resets above */
+ scan->xs_hitup = NULL;
}
void
diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c
index d394969a57..cffc5ddc75 100644
--- a/src/backend/access/gist/gistsplit.c
+++ b/src/backend/access/gist/gistsplit.c
@@ -15,7 +15,7 @@
* gistSplitByKey() is the entry point to this file.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 26d4a64694..cbdaec9d2b 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -4,7 +4,7 @@
* utilities routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include <float.h>
#include <math.h>
#include "access/gist_private.h"
@@ -624,9 +625,9 @@ gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r)
/*
* Fetch all keys in tuple.
- * returns new IndexTuple that contains GISTENTRY with fetched data
+ * Returns a new HeapTuple containing the originally-indexed data.
*/
-IndexTuple
+HeapTuple
gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple)
{
MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt);
@@ -660,7 +661,7 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple)
}
MemoryContextSwitchTo(oldcxt);
- return index_form_tuple(giststate->fetchTupdesc, fetchatt, isnull);
+ return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull);
}
float
@@ -852,7 +853,7 @@ gistproperty(Oid index_oid, int attno,
bool *res, bool *isnull)
{
HeapTuple tuple;
- Form_pg_index rd_index;
+ Form_pg_index rd_index PG_USED_FOR_ASSERTS_ONLY;
Form_pg_opclass rd_opclass;
Datum datum;
bool disnull;
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 53e5cea580..77d9d12f0b 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -4,7 +4,7 @@
* vacuuming routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index ffd7fd631b..585c92be26 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -3,7 +3,7 @@
* gistvalidate.c
* Opclass validator for GiST.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -23,6 +23,7 @@
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
+#include "utils/regproc.h"
#include "utils/syscache.h"
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 01c7ef7ea6..4f4fe8fab5 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -4,7 +4,7 @@
* WAL replay logic for GiST.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -13,7 +13,9 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gist_private.h"
+#include "access/gistxlog.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
@@ -80,9 +82,31 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
page = (Page) BufferGetPage(buffer);
- /* Delete old tuples */
- if (xldata->ntodelete > 0)
+ if (xldata->ntodelete == 1 && xldata->ntoinsert == 1)
{
+ /*
+ * When replacing one tuple with one other tuple, we must use
+ * PageIndexTupleOverwrite for consistency with gistplacetopage.
+ */
+ OffsetNumber offnum = *((OffsetNumber *) data);
+ IndexTuple itup;
+ Size itupsize;
+
+ data += sizeof(OffsetNumber);
+ itup = (IndexTuple) data;
+ itupsize = IndexTupleSize(itup);
+ if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize))
+ elog(ERROR, "failed to add item to GiST index page, size %d bytes",
+ (int) itupsize);
+ data += itupsize;
+ /* should be nothing left after consuming 1 tuple */
+ Assert(data - begin == datalen);
+ /* update insertion count for assert check below */
+ ninserted++;
+ }
+ else if (xldata->ntodelete > 0)
+ {
+ /* Otherwise, delete old tuples if any */
OffsetNumber *todelete = (OffsetNumber *) data;
data += sizeof(OffsetNumber) * xldata->ntodelete;
@@ -92,7 +116,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
GistMarkTuplesDeleted(page);
}
- /* add tuples */
+ /* Add new tuples if any */
if (data - begin < datalen)
{
OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
@@ -115,6 +139,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
}
}
+ /* Check that XLOG record contained expected number of tuples */
Assert(ninserted == xldata->ntoinsert);
PageSetLSN(page, lsn);
@@ -320,6 +345,48 @@ gist_xlog_cleanup(void)
}
/*
+ * Mask a GiST page before running consistency checks on it.
+ */
+void
+gist_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ /*
+ * NSN is nothing but a special purpose LSN. Hence, mask it for the same
+ * reason as mask_page_lsn.
+ */
+ GistPageSetNSN(page, (uint64) MASK_MARKER);
+
+ /*
+ * We update F_FOLLOW_RIGHT flag on the left child after writing WAL
+ * record. Hence, mask this flag. See gistplacetopage() for details.
+ */
+ GistMarkFollowRight(page);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * In gist leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * gistkillitems() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * During gist redo, we never mark a page as garbage. Hence, mask it to
+ * ignore any differences.
+ */
+ GistClearPageHasGarbage(page);
+}
+
+/*
* Write WAL record of a page split.
*/
XLogRecPtr
diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile
index 5d3bd94d3e..b154569b46 100644
--- a/src/backend/access/hash/Makefile
+++ b/src/backend/access/hash/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
- hashsearch.o hashsort.o hashutil.o hashvalidate.o
+OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \
+ hashsort.o hashutil.o hashvalidate.o hash_xlog.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 0a7da89285..c8a0ec78a9 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.
Primary bucket pages (henceforth just "bucket pages") are allocated in
-power-of-2 groups, called "split points" in the code. Buckets 0 and 1
-are created when the index is initialized. At the first split, buckets 2
-and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
-when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
-pages of a power-of-2 group appear consecutively in the index. This
-addressing scheme allows the physical location of a bucket page to be
-computed from the bucket number relatively easily, using only a small
-amount of control information. We take the log2() of the bucket number
-to determine which split point S the bucket belongs to, and then simply
-add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
-metapage) to compute the physical address. hashm_spares[S] can be
-interpreted as the total number of overflow pages that have been allocated
-before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
-so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
-block numbers 1 and 2, just after the meta page. We always have
-hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
-former. The difference between the two represents the number of overflow
-pages appearing between the bucket page groups of splitpoints N and N+1.
-
+power-of-2 groups, called "split points" in the code. That means at every new
+splitpoint we double the existing number of buckets. Allocating a huge chunk
+of bucket pages all at once isn't optimal, since it could take a very long
+time to consume them. To avoid such abrupt growth of the index size, we break
+up the allocation of buckets at a splitpoint into four equal phases. If
+(2 ^ x) is the total number of buckets to be allocated at a splitpoint (from
+now on we shall call this a splitpoint group), then we allocate one quarter
+(2 ^ (x - 2)) of the total buckets in each phase of the splitpoint group. The
+next quarter of the allocation happens only once the buckets of the previous
+phase have been consumed. For the initial splitpoint groups (< 10) we
+allocate all of their buckets in a single phase, since the number of buckets
+allocated in those groups is small. For groups >= 10 the allocation is
+distributed across four equal phases. At group 10 we allocate (2 ^ 9) buckets
+in 4 phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, where the numbers in curly braces
+indicate the number of buckets allocated within each phase of splitpoint
+group 10. For splitpoint groups 11 and 12 the allocation phases are
+{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively.
+Thus at each splitpoint group we still double the total number of buckets
+relative to the previous group, but we do so one phase at a time. The bucket
+pages allocated within one phase of a splitpoint group appear consecutively
+in the index. This addressing scheme allows the physical location of a
+bucket page to be computed from the bucket number relatively easily, using
+only a small amount of control information. As in the function
+_hash_spareindex, for a given bucket number we first compute the splitpoint
+group it belongs to and then the phase within that group to which the bucket
+belongs. Adding the two gives the global splitpoint phase number S of the
+bucket; we then simply add "hashm_spares[S] + 1" (where hashm_spares[] is an
+array stored in the metapage) to the bucket number to compute its physical
+address. hashm_spares[S] can be interpreted as the total number of overflow
+pages that have been allocated before the bucket pages of splitpoint phase S.
+hashm_spares[0] is always 0, so that buckets 0 and 1 always appear at block
+numbers 1 and 2, just after the meta page. We always have
+hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
+former. The difference between the two represents the number of overflow
+pages appearing between the bucket page groups of splitpoint phases N and N+1.
(Note: the above describes what happens when filling an initially minimally
-sized hash index. In practice, we try to estimate the required index size
-and allocate a suitable number of splitpoints immediately, to avoid
+sized hash index. In practice, we try to estimate the required index size and
+allocate a suitable number of splitpoint phases immediately, to avoid
expensive re-splitting during initial index build.)
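
As a small, hedged illustration of the address computation just described: the
helper below is not PostgreSQL code; BlockNumber is typedef'd locally,
spares[] stands in for hashm_spares[], and the caller is assumed to have
already obtained the bucket's global splitpoint phase S (as _hash_spareindex
does).

    #include <stdint.h>

    typedef uint32_t BlockNumber;   /* local stand-in for PostgreSQL's type */

    /*
     * Map a bucket number to its physical block number, given the bucket's
     * global splitpoint phase S and the spares[] array from the metapage.
     * Block 0 is the metapage, hence the trailing "+ 1".
     */
    static BlockNumber
    bucket_to_blkno(uint32_t bucket, uint32_t phase_S, const uint32_t *spares)
    {
        return (BlockNumber) (bucket + spares[phase_S] + 1);
    }
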
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
-When it is time to create a new splitpoint's worth of bucket pages, we
+When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the
@@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
-allocated: when we allocate a new splitpoint's worth of bucket pages, we
+allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.
@@ -126,53 +142,98 @@ the initially created buckets.
Lock Definitions
----------------
-We use both lmgr locks ("heavyweight" locks) and buffer context locks
-(LWLocks) to control access to a hash index. lmgr locks are needed for
-long-term locking since there is a (small) risk of deadlock, which we must
-be able to detect. Buffer context locks are used for short-term access
-control to individual pages of the index.
-
-LockPage(rel, page), where page is the page number of a hash bucket page,
-represents the right to split or compact an individual bucket. A process
-splitting a bucket must exclusive-lock both old and new halves of the
-bucket until it is done. A process doing VACUUM must exclusive-lock the
-bucket it is currently purging tuples from. Processes doing scans or
-insertions must share-lock the bucket they are scanning or inserting into.
-(It is okay to allow concurrent scans and insertions.)
-
-The lmgr lock IDs corresponding to overflow pages are currently unused.
-These are available for possible future refinements. LockPage(rel, 0)
-is also currently undefined (it was previously used to represent the right
-to modify the hash-code-to-bucket mapping, but it is no longer needed for
-that purpose).
-
-Note that these lock definitions are conceptually distinct from any sort
-of lock on the pages whose numbers they share. A process must also obtain
-read or write buffer lock on the metapage or bucket page before accessing
-said page.
-
-Processes performing hash index scans must hold share lock on the bucket
-they are scanning throughout the scan. This seems to be essential, since
-there is no reasonable way for a scan to cope with its bucket being split
-underneath it. This creates a possibility of deadlock external to the
-hash index code, since a process holding one of these locks could block
-waiting for an unrelated lock held by another process. If that process
-then does something that requires exclusive lock on the bucket, we have
-deadlock. Therefore the bucket locks must be lmgr locks so that deadlock
-can be detected and recovered from.
-
-Processes must obtain read (share) buffer context lock on any hash index
-page while reading it, and write (exclusive) lock while modifying it.
-To prevent deadlock we enforce these coding rules: no buffer lock may be
-held long term (across index AM calls), nor may any buffer lock be held
-while waiting for an lmgr lock, nor may more than one buffer lock
-be held at a time by any one process. (The third restriction is probably
-stronger than necessary, but it makes the proof of no deadlock obvious.)
+Concurrency control for hash indexes is provided using buffer content
+locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL,
+cleanup lock means that we hold an exclusive lock on the buffer and have
+observed at some point after acquiring the lock that we hold the only pin
+on that buffer. For hash indexes, a cleanup lock on a primary bucket page
+represents the right to perform an arbitrary reorganization of the entire
+bucket. Therefore, scans retain a pin on the primary bucket page for the
+bucket they are currently scanning. Splitting a bucket requires a cleanup
+lock on both the old and new primary bucket pages. VACUUM therefore takes
+a cleanup lock on every bucket page in order to remove tuples. It can also
+remove tuples copied to a new bucket by any previous split operation, because
+the cleanup lock taken on the primary bucket page guarantees that no scans
+which started prior to the most recent split can still be in progress. After
+cleaning each page individually, it attempts to take a cleanup lock on the
+primary bucket page in order to "squeeze" the bucket down to the minimum
+possible number of pages.
+
+To avoid deadlocks, we must be consistent about the lock order in which we
+lock the buckets for operations that require locks on two different buckets.
+We choose to always lock the lower-numbered bucket first. The metapage is
+only ever locked after all bucket locks have been taken.
+
+
+Metapage Caching
+----------------
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located. To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for every such
+operation. Instead, we retain a cached copy of the metapage in each backend's
+relcache entry. This will produce the correct
+bucket mapping as long as the target bucket hasn't been split since the
+last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or if never split as
+of the time it was created, in the space normally used for the
+previous block number (that is, hasho_prevblkno). This doesn't cost
+anything because the primary bucket page is always the first page in
+the chain, and the previous block number is therefore always, in
+reality, InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage. If
+so, the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split. If not, the bucket can't have
+been split, because a split would have created a new bucket with a higher
+bucket number than any we'd seen previously. In the latter case, we've
+locked the correct bucket and can proceed; in the former case, we must
+release the lock on this bucket, lock the metapage, update our cache,
+unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32. On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10. Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBlockNumber.)
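
A minimal, hedged sketch of the validity test described above (the struct and
function names are illustrative, not the actual PostgreSQL identifiers; only
the hasho_prevblkno comparison rule is taken from the text):

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative stand-in for the cached copy of the metapage. */
    typedef struct MetapageCache
    {
        uint32_t    bucket_count;   /* number of buckets when cache was taken */
    } MetapageCache;

    /*
     * prev_bucket_count is the value stored in the primary bucket page's
     * hasho_prevblkno.  Returns true if our cached bucket mapping is still
     * valid; false means: unlock, refresh the cache from the metapage, and
     * recompute the bucket number.
     */
    static bool
    cached_mapping_valid(const MetapageCache *cache, uint32_t prev_bucket_count)
    {
        return prev_bucket_count <= cache->bucket_count;
    }

In the real code the caller loops: compute the bucket from the cached
metapage, lock the primary bucket page, apply this check, and on failure
release the lock, refresh the cache, and retry.
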
Pseudocode Algorithms
---------------------
+The various flags that are used in hash index operations are described below:
+
+The bucket-being-split and bucket-being-populated flags indicate that a split
+operation is in progress for a bucket. During a split, the bucket-being-split
+flag is set on the old bucket and the bucket-being-populated flag is set on
+the new bucket. These flags are cleared once the split operation is finished.
+
+The split-cleanup flag indicates that a bucket which has been recently split
+still contains tuples that were also copied to the new bucket; it essentially
+marks the split as incomplete. Once we're certain that no scans which
+started before the new bucket was fully populated are still in progress, we
+can remove the copies from the old bucket and clear the flag. We insist that
+this flag must be clear before splitting a bucket; thus, a bucket can't be
+split again until the previous split is totally complete.
+
+The moved-by-split flag on a tuple indicates that the tuple was moved from the
+old to the new bucket. Concurrent scans will skip such tuples until the split
+operation is finished. Once a tuple is marked as moved-by-split, it remains so
+forever, but that does no harm. We intentionally do not clear the flag, since
+clearing it would generate additional, unnecessary I/O.
+
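A hedged sketch of the tuple filtering this implies for scans (the flag bit is
stored in the index tuple's t_info word; the mask value and names below are
assumptions for illustration):

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumed value of INDEX_MOVED_BY_SPLIT_MASK (the AM-reserved t_info bit). */
    #define MOVED_BY_SPLIT_MASK 0x2000

    /*
     * When the bucket being scanned is still being populated by a split, skip
     * tuples copied in by that split; the scan will return their originals
     * when it visits the old bucket.
     */
    static bool
    skip_tuple(uint16_t t_info, bool bucket_being_populated)
    {
        return bucket_being_populated && (t_info & MOVED_BY_SPLIT_MASK) != 0;
    }
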
The operations we need to support are: readers scanning the index for
entries of a particular hash code (which by definition are all in the same
bucket); insertion of a new tuple into the correct bucket; enlarging the
@@ -187,67 +248,75 @@ track of available overflow pages.
The reader algorithm is:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page lock (if a concurrent split happened)
- take heavyweight bucket lock
- retake meta page buffer content lock in shared mode
+ lock the primary bucket page of the target bucket
+ if the target bucket is still being populated by a split:
+ release the buffer content lock on current bucket page
+ pin and acquire the buffer content lock on old bucket in shared mode
+ release the buffer content lock on old bucket, but not pin
+ retake the buffer content lock on new bucket
+ arrange to scan the old bucket normally and the new bucket for
+ tuples which are not moved-by-split
-- then, per read request:
- release pin on metapage
- read current page of bucket and take shared buffer content lock
- step to next page if necessary (no chaining of locks)
+ reacquire content lock on current page
+ step to next page if necessary (no chaining of content locks, but keep
+ the pin on the primary bucket throughout the scan; we also maintain
+ a pin on the page currently being scanned)
get tuple
- release buffer content lock and pin on current page
+ release content lock
-- at scan shutdown:
- release bucket share-lock
-
-We can't hold the metapage lock while acquiring a lock on the target bucket,
-because that might result in an undetected deadlock (lwlocks do not participate
-in deadlock detection). Instead, we relock the metapage after acquiring the
-bucket page lock and check whether the bucket has been split. If not, we're
-done. If so, we release our previously-acquired lock and repeat the process
-using the new bucket number. Holding the bucket sharelock for
-the remainder of the scan prevents the reader's current-tuple pointer from
-being invalidated by splits or compactions. Notice that the reader's lock
-does not prevent other buckets from being split or compacted.
+ release all pins still held
+
+Holding the buffer pin on the primary bucket page for the whole scan prevents
+the reader's current-tuple pointer from being invalidated by splits or
+compactions. (Of course, other buckets can still be split or compacted.)
To keep concurrency reasonably good, we require readers to cope with
concurrent insertions, which means that they have to be able to re-find
-their current scan position after re-acquiring the page sharelock. Since
-deletion is not possible while a reader holds the bucket sharelock, and
-we assume that heap tuple TIDs are unique, this can be implemented by
+their current scan position after re-acquiring the buffer content lock on
+the page. Since deletion is not possible while a reader holds the bucket pin,
+and we assume that heap tuple TIDs are unique, this can be implemented by
searching for the same heap tuple TID previously returned. Insertion does
not move index entries across pages, so the previously-returned index entry
should always be on the same page, at the same or higher offset number,
as it was before.
+To allow scans during a bucket split: if, at the start of the scan, the
+bucket is marked as bucket-being-populated, the scan reads all the tuples in
+that bucket except those marked as moved-by-split. Once it finishes scanning
+all the tuples in the current bucket, it scans the old bucket from which this
+bucket was formed by the split.
+
The insertion algorithm is rather similar:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page lock (if a concurrent split happened)
- take heavyweight bucket lock in shared mode
- retake meta page buffer content lock in shared mode
--- (so far same as reader)
- release pin on metapage
- pin current page of bucket and take exclusive buffer content lock
- if full, release, read/exclusive-lock next page; repeat as needed
+ lock the primary bucket page of the target bucket
+-- (so far same as reader, except for acquisition of buffer content lock in
+ exclusive mode on primary bucket page)
+ if the bucket-being-split flag is set for a bucket and pin count on it is
+ one, then finish the split
+ release the buffer content lock on current bucket
+ get the "new" bucket which was being populated by the split
+ scan the new bucket and form the hash table of TIDs
+ conditionally get the cleanup lock on old and new buckets
+ if we get the lock on both the buckets
+ finish the split using algorithm mentioned below for split
+ release the pin on old bucket and restart the insert from beginning.
+ if current page is full, first check if this page contains any dead tuples.
+ if yes, remove dead tuples from the current page and again check for the
+ availability of the space. If enough space found, insert the tuple else
+ release lock but not pin, read/exclusive-lock
+ next page; repeat as needed
>> see below if no space in any page of bucket
+ take buffer content lock in exclusive mode on metapage
insert tuple at appropriate place in page
- mark current page dirty and release buffer content lock and pin
- release heavyweight share-lock
- pin meta page and take buffer content lock in shared mode
+ mark current page dirty
increment tuple count, decide if split needed
- mark meta page dirty and release buffer content lock and pin
- done if no split needed, else enter Split algorithm below
+ mark meta page dirty
+ write WAL for insertion of tuple
+ release the buffer content lock on metapage
+ release buffer content lock on current page
+ if current page is not a bucket page, release the pin on bucket page
+ if split is needed, enter Split algorithm below
+ release the pin on metapage
To speed searches, the index entries within any individual index page are
kept sorted by hash code; the insertion code must take care to insert new
@@ -256,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this
as explained above. We only need the short-term buffer locks to ensure
that readers do not see a partially-updated page.
-It is clearly impossible for readers and inserters to deadlock, and in
-fact this algorithm allows them a very high degree of concurrency.
-(The exclusive metapage lock taken to update the tuple count is stronger
-than necessary, since readers do not care about the tuple count, but the
-lock is held for such a short time that this is probably not an issue.)
+To avoid deadlock between readers and inserters, whenever there is a need
+to lock multiple buckets, we always take the locks in the order suggested in
+Lock Definitions above. This algorithm allows them a very high degree of
+concurrency. (The exclusive metapage lock taken to update the tuple count
+is stronger than necessary, since readers do not care about the tuple count,
+but the lock is held for such a short time that this is probably not an
+issue.)
When an inserter cannot find space in any existing page of a bucket, it
must obtain an overflow page and add that page to the bucket's chain.
@@ -271,46 +342,47 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets).
The algorithm attempts, but does not necessarily succeed, to split one
existing bucket in two, thereby lowering the fill ratio:
- pin meta page and take buffer content lock in exclusive mode
- check split still needed
- if split not needed anymore, drop buffer content lock and pin and exit
- decide which bucket to split
- Attempt to X-lock old bucket number (definitely could fail)
- Attempt to X-lock new bucket number (shouldn't fail, but...)
- if above fail, drop locks and pin and exit
- update meta page to reflect new number of buckets
- mark meta page dirty and release buffer content lock and pin
- -- now, accesses to all other buckets can proceed.
- Perform actual split of bucket, moving tuples as needed
- >> see below about acquiring needed extra space
- Release X-locks of old and new buckets
-
-Note the metapage lock is not held while the actual tuple rearrangement is
-performed, so accesses to other buckets can proceed in parallel; in fact,
-it's possible for multiple bucket splits to proceed in parallel.
-
-Split's attempt to X-lock the old bucket number could fail if another
-process holds S-lock on it. We do not want to wait if that happens, first
-because we don't want to wait while holding the metapage exclusive-lock,
-and second because it could very easily result in deadlock. (The other
-process might be out of the hash AM altogether, and could do something
-that blocks on another lock this process holds; so even if the hash
-algorithm itself is deadlock-free, a user-induced deadlock could occur.)
-So, this is a conditional LockAcquire operation, and if it fails we just
-abandon the attempt to split. This is all right since the index is
-overfull but perfectly functional. Every subsequent inserter will try to
-split, and eventually one will succeed. If multiple inserters failed to
-split, the index might still be overfull, but eventually, the index will
+ pin meta page and take buffer content lock in exclusive mode
+ check split still needed
+ if split not needed anymore, drop buffer content lock and pin and exit
+ decide which bucket to split
+ try to take a cleanup lock on that bucket; if fail, give up
+ if that bucket is still being split or has split-cleanup work:
+ try to finish the split and the cleanup work
+ if that succeeds, start over; if it fails, give up
+ mark the old and new buckets indicating split is in progress
+ mark both old and new buckets as dirty
+ write WAL for allocation of new page for split
+ copy the tuples that belongs to new bucket from old bucket, marking
+ them as moved-by-split
+ write WAL record for moving tuples to new page once the new page is full
+ or all the pages of old bucket are finished
+ release lock but not pin for primary bucket page of old bucket,
+ read/shared-lock next page; repeat as needed
+ clear the bucket-being-split and bucket-being-populated flags
+ mark the old bucket indicating split-cleanup
+ write WAL for changing the flags on both old and new buckets
+
+The split operation's attempt to acquire a cleanup lock on the old bucket
+could fail if another process holds any lock or pin on it. We do not want to
+wait if that happens, because we don't want to wait while holding the metapage
+exclusive-lock. So, this is a conditional cleanup-lock acquisition, and if
+it fails we just abandon the attempt to split. This is all right since the
+index is overfull but perfectly functional. Every subsequent inserter will
+try to split, and eventually one will succeed. If multiple inserters failed
+to split, the index might still be overfull, but eventually, the index will
not be overfull and split attempts will stop. (We could make a successful
splitter loop to see if the index is still overfull, but it seems better to
distribute the split overhead across successive insertions.)
-A problem is that if a split fails partway through (eg due to insufficient
-disk space) the index is left corrupt. The probability of that could be
-made quite low if we grab a free page or two before we update the meta
-page, but the only real solution is to treat a split as a WAL-loggable,
-must-complete action. I'm not planning to teach hash about WAL in this
-go-round.
+If a split fails partway through (e.g. due to insufficient disk space or an
+interrupt), the index will not be corrupted. Instead, we'll retry the split
+every time a tuple is inserted into the old bucket prior to inserting the new
+tuple; eventually, we should succeed. The fact that a split is left
+unfinished doesn't prevent subsequent buckets from being split, but we won't
+try to split the bucket again until the prior split is finished. In other
+words, a bucket can be in the middle of being split for some time, but it can't
+be in the middle of two splits at the same time.
The fourth operation is garbage collection (bulk deletion):
@@ -319,31 +391,46 @@ The fourth operation is garbage collection (bulk deletion):
fetch current max bucket number
release meta page buffer content lock and pin
while next bucket <= max bucket do
- Acquire X lock on target bucket
- Scan and remove tuples, compact free space as needed
- Release X lock
+ acquire cleanup lock on primary bucket page
+ loop:
+ scan and remove tuples
+ mark the target page dirty
+ write WAL for deleting tuples from target page
+ if this is the last bucket page, break out of loop
+ pin and x-lock next page
+ release prior lock and pin (except keep pin on primary bucket page)
+ if the page we have locked is not the primary bucket page:
+ release lock and take exclusive lock on primary bucket page
+ if there are no other pins on the primary bucket page:
+ squeeze the bucket to remove free space
+ release the pin on primary bucket page
next bucket ++
end loop
pin metapage and take buffer content lock in exclusive mode
check if number of buckets changed
if so, release content lock and pin and return to for-each-bucket loop
else update metapage tuple count
- mark meta page dirty and release buffer content lock and pin
-
-Note that this is designed to allow concurrent splits. If a split occurs,
-tuples relocated into the new bucket will be visited twice by the scan,
-but that does no harm. (We must however be careful about the statistics
+ mark meta page dirty and write WAL for update of metapage
+ release buffer content lock and pin
+
+Note that this is designed to allow concurrent splits and scans. If a split
+occurs, tuples relocated into the new bucket will be visited twice by the
+scan, but that does no harm. Because we release the lock on a bucket page
+during the cleanup scan of a bucket, a concurrent scan can start on the
+bucket, but it will always remain behind the cleanup. It is essential to
+keep scans behind cleanup; otherwise, vacuum could decrease the TIDs that
+are required to complete the scan. Since a scan that returns multiple tuples
+from the same bucket page always expects the next valid TID to be greater
+than or equal to the current TID, it might then miss tuples. This holds true
+for backward scans as well (backward scans first traverse each bucket,
+starting from the first bucket page to the last overflow page in the chain).
+We must be careful about the statistics
reported by the VACUUM operation. What we can do is count the number of
-tuples scanned, and believe this in preference to the stored tuple count
-if the stored tuple count and number of buckets did *not* change at any
-time during the scan. This provides a way of correcting the stored tuple
-count if it gets out of sync for some reason. But if a split or insertion
-does occur concurrently, the scan count is untrustworthy; instead,
-subtract the number of tuples deleted from the stored tuple count and
-use that.)
-
-The exclusive lock request could deadlock in some strange scenarios, but
-we can just error out without any great harm being done.
+tuples scanned, and believe this in preference to the stored tuple count if
+the stored tuple count and number of buckets did *not* change at any time
+during the scan. This provides a way of correcting the stored tuple count if
+it gets out of sync for some reason. But if a split or insertion does occur
+concurrently, the scan count is untrustworthy; instead, subtract the number of
+tuples deleted from the stored tuple count and use that.
Free Space Management
@@ -366,18 +453,16 @@ Obtaining an overflow page:
search for a free page (zero bit in bitmap)
if found:
set bit in bitmap
- mark bitmap page dirty and release content lock
+ mark bitmap page dirty
take metapage buffer content lock in exclusive mode
if first-free-bit value did not change,
update it and mark meta page dirty
- release meta page buffer content lock
- return page number
else (not found):
release bitmap page buffer content lock
loop back to try next bitmap page, if any
-- here when we have checked all bitmap pages; we hold meta excl. lock
extend index to add another overflow page; update meta information
- mark meta page dirty and release buffer content lock
+ mark meta page dirty
return page number
It is slightly annoying to release and reacquire the metapage lock
@@ -397,12 +482,17 @@ like this:
-- having determined that no space is free in the target bucket:
remember last page of bucket, drop write lock on it
- call free-page-acquire routine
re-write-lock last page of bucket
if it is not last anymore, step to the last page
- update (former) last page to point to new page
+ execute free-page-acquire (obtaining an overflow page) mechanism
+ described above
+ update (former) last page to point to the new page and mark buffer dirty
write-lock and initialize new page, with back link to former last page
- write and release former last page
+ write WAL for addition of overflow page
+ release the locks on meta page and bitmap page acquired in
+ free-page-acquire algorithm
+ release the lock on former last page
+ release the lock on new overflow page
insert tuple into new page
-- etc.
@@ -417,13 +507,11 @@ free page; there can be no other process holding lock on it.
Bucket splitting uses a similar algorithm if it has to extend the new
bucket, but it need not worry about concurrent extension since it has
-exclusive lock on the new bucket.
+buffer content lock in exclusive mode on the new bucket.
-Freeing an overflow page is done by garbage collection and by bucket
-splitting (the old bucket may contain no-longer-needed overflow pages).
-In both cases, the process holds exclusive lock on the containing bucket,
-so need not worry about other accessors of pages in the bucket. The
-algorithm is:
+Freeing an overflow page requires the process to hold buffer content lock in
+exclusive mode on the containing bucket, so it need not worry about other
+accessors of pages in the bucket. The algorithm is:
delink overflow page from bucket chain
(this requires read/update/write/release of fore and aft siblings)
@@ -431,12 +519,14 @@ algorithm is:
determine which bitmap page contains the free space bit for page
release meta page buffer content lock
pin bitmap page and take buffer content lock in exclusive mode
- update bitmap bit
- mark bitmap page dirty and release buffer content lock and pin
- if page number is less than what we saw as first-free-bit in meta:
retake meta page buffer content lock in exclusive mode
+ move (insert) tuples that belong to the overflow page being freed
+ update bitmap bit
+ mark bitmap page dirty
if page number is still less than first-free-bit,
update first-free-bit field and mark meta page dirty
+ write WAL for delinking overflow page operation
+ release buffer content lock and pin
release meta page buffer content lock and pin
We have to do it this way because we must clear the bitmap bit before
@@ -447,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be
avoided is having first-free-bit greater than the actual first free bit,
because then that free page would never be found by searchers.
-All the freespace operations should be called while holding no buffer
-locks. Since they need no lmgr locks, deadlock is not possible.
+The reason for moving tuples off the overflow page in the same operation that
+delinks the page is to make the whole change atomic. Not doing so could lead
+to spurious reads on a standby: basically, the user might see the same tuple
+twice.
+
+
+WAL Considerations
+------------------
+
+The hash index operations like create index, insert, delete, bucket split,
+allocate overflow page, and squeeze in themselves don't guarantee hash index
+consistency after a crash. To provide robustness, we write WAL for each of
+these operations.
+
+CREATE INDEX writes multiple WAL records. First, we write a record to cover
+the initialization of the metapage, followed by one for each new bucket
+created, followed by one for the initial bitmap page. It's not important for
+index creation to appear atomic, because the index isn't yet visible to any
+other transaction, and the creating transaction will roll back in the event of
+a crash. It would be difficult to cover the whole operation with a single
+write-ahead log record anyway, because we can log only a fixed number of
+pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.
+
+Ordinary item insertions (that don't force a page split or need a new overflow
+page) are single WAL entries. They touch a single bucket page and the
+metapage. The metapage is updated during replay just as it is during the
+original operation.
+
+If an insertion causes the addition of an overflow page, there will be one
+WAL entry for the new overflow page and a second entry for the insert itself.
+
+If an insertion causes a bucket split, there will be one WAL entry for the
+insert itself, followed by a WAL entry for allocating a new bucket, followed
+by a WAL entry for each overflow bucket page in the new bucket to which
+tuples are moved from the old bucket, followed by a WAL entry to indicate
+that the split is complete for both the old and new buckets. A split
+operation which requires overflow pages to complete will need to write a WAL
+record for each new allocation of an overflow page.
+
+As splitting involves multiple atomic actions, it's possible that the system
+crashes partway through moving tuples from the old bucket's pages to the new
+bucket. In such a case, after recovery, the old and new buckets will be
+marked with bucket-being-split and bucket-being-populated flags respectively
+which indicates that split is in progress for those buckets. The reader
+algorithm works correctly, as it will scan both the old and new buckets when
+the split is in progress as explained in the reader algorithm section above.
+
+We finish the split at the next insert or split operation on the old bucket,
+as explained in the insert and split algorithms above. It could be done during
+searches, too, but it seems best not to put any extra updates in what would
+otherwise be a read-only operation (updating is not possible in hot standby
+mode anyway). It would seem natural to complete the split in VACUUM, but since
+splitting a bucket might require allocating a new page, it might fail if you
+run out of disk space. That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you run out of disk space,
+and now VACUUM won't finish because you're out of disk space. In contrast,
+an insertion can require enlarging the physical file anyway.
+
+Deletion of tuples from a bucket is performed for two reasons: to remove dead
+tuples, and to remove tuples that were moved by a bucket split. A WAL entry
+is made for each bucket page from which tuples are removed, and then another
+WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
+are removed, a separate WAL entry is made to update the metapage.
+
+As deletion involves multiple atomic operations, it is quite possible that the
+system crashes (a) after removing tuples from some of the bucket pages, (b)
+before clearing the garbage flag, or (c) before updating the metapage. If the
+system crashes before completing (b), it will again try to clean the bucket
+during the next vacuum or insert after recovery, which can have a performance
+impact, but it will work fine. If the system crashes before completing (c),
+after recovery there could be some additional splits until the next vacuum
+updates the metapage, but the other operations like insert, delete and scan
+will work correctly. We can fix this problem by actually updating the
+metapage based on delete operation during replay, but it's not clear whether
+it's worth the complication.
+
+A squeeze operation moves tuples from pages later in the bucket's chain to
+pages earlier in the chain, and writes a WAL record when either the page to
+which it is writing tuples becomes full or the page from which it is removing
+tuples becomes empty.
+
+As a squeeze operation involves multiple atomic operations, it is quite
+possible that the system crashes before completing the operation on the
+entire bucket. After recovery, the operations will work correctly, but the
+index will remain bloated, which can impact the performance of read and
+insert operations until the next vacuum squeezes the bucket completely.
Other Notes
-----------
-All the shenanigans with locking prevent a split occurring while *another*
-process is stopped in a given bucket. They do not ensure that one of
-our *own* backend's scans is not stopped in the bucket, because lmgr
-doesn't consider a process's own locks to conflict. So the Split
-algorithm must check for that case separately before deciding it can go
-ahead with the split. VACUUM does not have this problem since nothing
-else can be happening within the vacuuming backend.
-
-Should we instead try to fix the state of any conflicting local scan?
-Seems mighty ugly --- got to move the held bucket S-lock as well as lots
-of other messiness. For now, just punt and don't split.
+Cleanup locks prevent a split from occurring while *another* process is stopped
+in a given bucket. They also ensure that one of our *own* backend's scans is not
+stopped in the bucket.
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 07496f8156..8a3297924f 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -3,7 +3,7 @@
* hash.c
* Implementation of Margo Seltzer's Hashing package for postgres.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -19,13 +19,16 @@
#include "postgres.h"
#include "access/hash.h"
+#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
+#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"
/* Working state for hashbuild and its callback */
@@ -33,6 +36,7 @@ typedef struct
{
HSpool *spool; /* NULL if not using spooling */
double indtuples; /* # tuples accepted into index */
+ Relation heapRel; /* heap relation descriptor */
} HashBuildState;
static void hashbuildCallback(Relation index,
@@ -65,6 +69,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
+ amroutine->amcanparallel = false;
amroutine->amkeytype = INT4OID;
amroutine->ambuild = hashbuild;
@@ -84,6 +89,9 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amendscan = hashendscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
@@ -114,7 +122,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
/* Initialize the hash index metadata page and initial buckets */
- num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
+ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
/*
* If we just insert the tuples into the index in scan order, then
@@ -147,6 +155,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
/* prepare to build the index */
buildstate.indtuples = 0;
+ buildstate.heapRel = heap;
/* do the heap scan */
reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
@@ -155,7 +164,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
if (buildstate.spool)
{
/* sort the tuples and insert them into the index */
- _h_indexbuild(buildstate.spool);
+ _h_indexbuild(buildstate.spool, buildstate.heapRel);
_h_spooldestroy(buildstate.spool);
}
@@ -176,7 +185,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
void
hashbuildempty(Relation index)
{
- _hash_metapinit(index, 0, INIT_FORKNUM);
+ _hash_init(index, 0, INIT_FORKNUM);
}
/*
@@ -211,7 +220,7 @@ hashbuildCallback(Relation index,
itup = index_form_tuple(RelationGetDescr(index),
index_values, index_isnull);
itup->t_tid = htup->t_self;
- _hash_doinsert(index, itup);
+ _hash_doinsert(index, itup, buildstate->heapRel);
pfree(itup);
}
@@ -227,7 +236,8 @@ hashbuildCallback(Relation index,
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
Datum index_values[1];
bool index_isnull[1];
@@ -243,7 +253,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull,
itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
itup->t_tid = *ht_ctid;
- _hash_doinsert(rel, itup);
+ _hash_doinsert(rel, itup, heapRel);
pfree(itup);
@@ -273,7 +283,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
* Reacquire the read lock here.
*/
if (BufferIsValid(so->hashso_curbuf))
- _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
/*
* If we've already initialized this scan, we can just advance it in the
@@ -286,16 +296,21 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
/*
* An insertion into the current index page could have happened while
* we didn't have read lock on it. Re-find our position by looking
- * for the TID we previously returned. (Because we hold share lock on
- * the bucket, no deletions or splits could have occurred; therefore
- * we can expect that the TID still exists in the current index page,
- * at an offset >= where we were.)
+ * for the TID we previously returned. (Because we hold a pin on the
+ * primary bucket page, no deletions or splits could have occurred;
+ * therefore we can expect that the TID still exists in the current
+ * index page, at an offset >= where we were.)
*/
OffsetNumber maxoffnum;
buf = so->hashso_curbuf;
Assert(BufferIsValid(buf));
page = BufferGetPage(buf);
+
+ /*
+		 * We don't need to test for an old snapshot here, as the current
+		 * buffer is pinned, so vacuum can't clean the page.
+ */
maxoffnum = PageGetMaxOffsetNumber(page);
for (offnum = ItemPointerGetOffsetNumber(current);
offnum <= maxoffnum;
@@ -318,14 +333,24 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
if (scan->kill_prior_tuple)
{
/*
- * Yes, so mark it by setting the LP_DEAD state in the item flags.
+ * Yes, so remember it for later. (We'll deal with all such tuples
+ * at once right after leaving the index page or at end of scan.)
+			 * If the caller reverses the indexscan direction, it is quite
+			 * possible that the same item might get entered multiple times.
+			 * But we don't detect that; instead, we just forget any excess
+			 * entries.
*/
- ItemIdMarkDead(PageGetItemId(page, offnum));
+ if (so->killedItems == NULL)
+ so->killedItems = palloc(MaxIndexTuplesPerPage *
+ sizeof(HashScanPosItem));
- /*
- * Since this can be redone later if needed, mark as a hint.
- */
- MarkBufferDirtyHint(buf, true);
+ if (so->numKilled < MaxIndexTuplesPerPage)
+ {
+ so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
+ so->killedItems[so->numKilled].indexOffset =
+ ItemPointerGetOffsetNumber(&(so->hashso_curpos));
+ so->numKilled++;
+ }
}
/*
@@ -353,7 +378,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
/* Release read lock on current buffer, but keep it pinned */
if (BufferIsValid(so->hashso_curbuf))
- _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
/* Return current heap TID on success */
scan->xs_ctup.t_self = so->hashso_heappos;
@@ -423,17 +448,20 @@ hashbeginscan(Relation rel, int nkeys, int norderbys)
scan = RelationGetIndexScan(rel, nkeys, norderbys);
so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
- so->hashso_bucket_valid = false;
- so->hashso_bucket_blkno = 0;
so->hashso_curbuf = InvalidBuffer;
+ so->hashso_bucket_buf = InvalidBuffer;
+ so->hashso_split_bucket_buf = InvalidBuffer;
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
ItemPointerSetInvalid(&(so->hashso_heappos));
- scan->opaque = so;
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
- /* register scan in case we change pages it's using */
- _hash_regscan(scan);
+ so->killedItems = NULL;
+ so->numKilled = 0;
+
+ scan->opaque = so;
return scan;
}
@@ -448,15 +476,18 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Relation rel = scan->indexRelation;
- /* release any pin we still hold */
- if (BufferIsValid(so->hashso_curbuf))
- _hash_dropbuf(rel, so->hashso_curbuf);
- so->hashso_curbuf = InvalidBuffer;
+ /*
+ * Before leaving current page, deal with any killed items. Also, ensure
+ * that we acquire lock on current page before calling _hash_kill_items.
+ */
+ if (so->numKilled > 0)
+ {
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
+ _hash_kill_items(scan);
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
+ }
- /* release lock on bucket, too */
- if (so->hashso_bucket_blkno)
- _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
- so->hashso_bucket_blkno = 0;
+ _hash_dropscanbuf(rel, so);
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
@@ -468,8 +499,10 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
memmove(scan->keyData,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
- so->hashso_bucket_valid = false;
}
+
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
}
/*
@@ -481,19 +514,21 @@ hashendscan(IndexScanDesc scan)
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Relation rel = scan->indexRelation;
- /* don't need scan registered anymore */
- _hash_dropscan(scan);
-
- /* release any pin we still hold */
- if (BufferIsValid(so->hashso_curbuf))
- _hash_dropbuf(rel, so->hashso_curbuf);
- so->hashso_curbuf = InvalidBuffer;
+ /*
+ * Before leaving current page, deal with any killed items. Also, ensure
+ * that we acquire lock on current page before calling _hash_kill_items.
+ */
+ if (so->numKilled > 0)
+ {
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
+ _hash_kill_items(scan);
+ LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
+ }
- /* release lock on bucket, too */
- if (so->hashso_bucket_blkno)
- _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
- so->hashso_bucket_blkno = 0;
+ _hash_dropscanbuf(rel, so);
+ if (so->killedItems != NULL)
+ pfree(so->killedItems);
pfree(so);
scan->opaque = NULL;
}
@@ -503,6 +538,9 @@ hashendscan(IndexScanDesc scan)
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
*
+ * This function also deletes the tuples that are moved by split to other
+ * bucket.
+ *
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
IndexBulkDeleteResult *
@@ -516,27 +554,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
- Buffer metabuf;
+ Buffer metabuf = InvalidBuffer;
HashMetaPage metap;
- HashMetaPageData local_metapage;
+ HashMetaPage cachedmetap;
tuples_removed = 0;
num_index_tuples = 0;
/*
- * Read the metapage to fetch original bucket and tuple counts. Also, we
- * keep a copy of the last-seen metapage so that we can use its
- * hashm_spares[] values to compute bucket page addresses. This is a bit
- * hokey but perfectly safe, since the interesting entries in the spares
- * array cannot change under us; and it beats rereading the metapage for
- * each bucket.
+ * We need a copy of the metapage so that we can use its hashm_spares[]
+ * values to compute bucket page addresses, but a cached copy should be
+ * good enough. (If not, we'll detect that further down and refresh the
+ * cache as necessary.)
*/
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- metap = HashPageGetMeta(BufferGetPage(metabuf));
- orig_maxbucket = metap->hashm_maxbucket;
- orig_ntuples = metap->hashm_ntuples;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- _hash_relbuf(rel, metabuf);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(cachedmetap != NULL);
+
+ orig_maxbucket = cachedmetap->hashm_maxbucket;
+ orig_ntuples = cachedmetap->hashm_ntuples;
/* Scan the buckets that we know exist */
cur_bucket = 0;
@@ -547,102 +582,89 @@ loop_top:
{
BlockNumber bucket_blkno;
BlockNumber blkno;
- bool bucket_dirty = false;
+ Buffer bucket_buf;
+ Buffer buf;
+ HashPageOpaque bucket_opaque;
+ Page page;
+ bool split_cleanup = false;
/* Get address of bucket's start page */
- bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
- /* Exclusive-lock the bucket so we can shrink it */
- _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
+ blkno = bucket_blkno;
- /* Shouldn't have any active scans locally, either */
- if (_hash_has_active_scan(rel, cur_bucket))
- elog(ERROR, "hash index has active scan during VACUUM");
+ /*
+		 * We need to acquire a cleanup lock on the primary bucket page to wait
+		 * out concurrent scans before deleting the dead tuples.
+ */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+ LockBufferForCleanup(buf);
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
- /* Scan each page in bucket */
- blkno = bucket_blkno;
- while (BlockNumberIsValid(blkno))
- {
- Buffer buf;
- Page page;
- HashPageOpaque opaque;
- OffsetNumber offno;
- OffsetNumber maxoffno;
- OffsetNumber deletable[MaxOffsetNumber];
- int ndeletable = 0;
-
- vacuum_delay_point();
-
- buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
- LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
- info->strategy);
- page = BufferGetPage(buf);
- opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == cur_bucket);
-
- /* Scan each tuple in page */
- maxoffno = PageGetMaxOffsetNumber(page);
- for (offno = FirstOffsetNumber;
- offno <= maxoffno;
- offno = OffsetNumberNext(offno))
- {
- IndexTuple itup;
- ItemPointer htup;
+ page = BufferGetPage(buf);
+ bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- itup = (IndexTuple) PageGetItem(page,
- PageGetItemId(page, offno));
- htup = &(itup->t_tid);
- if (callback(htup, callback_state))
- {
- /* mark the item for deletion */
- deletable[ndeletable++] = offno;
- tuples_removed += 1;
- }
- else
- num_index_tuples += 1;
- }
+ /*
+		 * If the bucket contains tuples that were moved by a split, then we
+		 * need to delete those tuples. We can't delete them while the split
+		 * operation on the bucket is unfinished, as they are still needed by
+		 * scans.
+ */
+ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
+ H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
+ {
+ split_cleanup = true;
/*
- * Apply deletions and write page if needed, advance to next page.
+ * This bucket might have been split since we last held a lock on
+ * the metapage. If so, hashm_maxbucket, hashm_highmask and
+ * hashm_lowmask might be old enough to cause us to fail to remove
+ * tuples left behind by the most recent split. To prevent that,
+ * now that the primary page of the target bucket has been locked
+ * (and thus can't be further split), check whether we need to
+ * update our cached metapage data.
*/
- blkno = opaque->hasho_nextblkno;
-
- if (ndeletable > 0)
+ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
+ if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
{
- PageIndexMultiDelete(page, deletable, ndeletable);
- _hash_wrtbuf(rel, buf);
- bucket_dirty = true;
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
}
- else
- _hash_relbuf(rel, buf);
}
- /* If we deleted anything, try to compact free space */
- if (bucket_dirty)
- _hash_squeezebucket(rel, cur_bucket, bucket_blkno,
- info->strategy);
+ bucket_buf = buf;
+
+ hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
+ cachedmetap->hashm_maxbucket,
+ cachedmetap->hashm_highmask,
+ cachedmetap->hashm_lowmask, &tuples_removed,
+ &num_index_tuples, split_cleanup,
+ callback, callback_state);
- /* Release bucket lock */
- _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
+ _hash_dropbuf(rel, bucket_buf);
/* Advance to next bucket */
cur_bucket++;
}
+ if (BufferIsInvalid(metabuf))
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
/* Write-lock metapage and check for split since we started */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
if (cur_maxbucket != metap->hashm_maxbucket)
{
/* There's been a split, so process the additional bucket(s) */
- cur_maxbucket = metap->hashm_maxbucket;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- _hash_relbuf(rel, metabuf);
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ cur_maxbucket = cachedmetap->hashm_maxbucket;
goto loop_top;
}
/* Okay, we're really done. Update tuple count in metapage. */
+ START_CRIT_SECTION();
if (orig_maxbucket == metap->hashm_maxbucket &&
orig_ntuples == metap->hashm_ntuples)
@@ -668,7 +690,28 @@ loop_top:
num_index_tuples = metap->hashm_ntuples;
}
- _hash_wrtbuf(rel, metabuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_update_meta_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.ntuples = metap->hashm_ntuples;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
+
+ XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _hash_relbuf(rel, metabuf);
/* return statistics */
if (stats == NULL)
@@ -704,9 +747,262 @@ hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
return stats;
}
-
+/*
+ * Helper function to perform deletion of index entries from a bucket.
+ *
+ * This function expects that the caller has acquired a cleanup lock on the
+ * primary bucket page, and will return with a write lock again held on the
+ * primary bucket page. The lock won't necessarily be held continuously,
+ * though, because we'll release it when visiting overflow pages.
+ *
+ * It would be very bad if this function cleaned a page while some other
+ * backend was in the midst of scanning it, because hashgettuple assumes
+ * that the next valid TID will be greater than or equal to the current
+ * valid TID. There can't be any concurrent scans in progress when we first
+ * enter this function because of the cleanup lock we hold on the primary
+ * bucket page, but as soon as we release that lock, there might be. We
+ * handle that by conspiring to prevent those scans from passing our cleanup
+ * scan. To do that, we lock the next page in the bucket chain before
+ * releasing the lock on the previous page. (This type of lock chaining is
+ * not ideal, so we might want to look for a better solution at some point.)
+ *
+ * We need to retain a pin on the primary bucket to ensure that no concurrent
+ * split can start.
+ */
void
-hash_redo(XLogReaderState *record)
+hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
+ BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool split_cleanup,
+ IndexBulkDeleteCallback callback, void *callback_state)
{
- elog(PANIC, "hash_redo: unimplemented");
+ BlockNumber blkno;
+ Buffer buf;
+ Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
+ bool bucket_dirty = false;
+
+ blkno = bucket_blkno;
+ buf = bucket_buf;
+
+ if (split_cleanup)
+ new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
+ lowmask, maxbucket);
+
+ /* Scan each page in bucket */
+ for (;;)
+ {
+ HashPageOpaque opaque;
+ OffsetNumber offno;
+ OffsetNumber maxoffno;
+ Buffer next_buf;
+ Page page;
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ bool retain_pin = false;
+ bool clear_dead_marking = false;
+
+ vacuum_delay_point();
+
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /* Scan each tuple in page */
+ maxoffno = PageGetMaxOffsetNumber(page);
+ for (offno = FirstOffsetNumber;
+ offno <= maxoffno;
+ offno = OffsetNumberNext(offno))
+ {
+ ItemPointer htup;
+ IndexTuple itup;
+ Bucket bucket;
+ bool kill_tuple = false;
+
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, offno));
+ htup = &(itup->t_tid);
+
+ /*
+			 * To remove the dead tuples, we strictly want to rely on the
+			 * results of the callback function; refer to btvacuumpage for the
+			 * detailed reason.
+ */
+ if (callback && callback(htup, callback_state))
+ {
+ kill_tuple = true;
+ if (tuples_removed)
+ *tuples_removed += 1;
+ }
+ else if (split_cleanup)
+ {
+ /* delete the tuples that are moved by split. */
+ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+ maxbucket,
+ highmask,
+ lowmask);
+ /* mark the item for deletion */
+ if (bucket != cur_bucket)
+ {
+ /*
+					 * We expect tuples to belong either to the current bucket
+					 * or to new_bucket. This is ensured because we don't allow
+					 * further splits from a bucket that contains garbage; see
+					 * comments in _hash_expandtable.
+ */
+ Assert(bucket == new_bucket);
+ kill_tuple = true;
+ }
+ }
+
+ if (kill_tuple)
+ {
+ /* mark the item for deletion */
+ deletable[ndeletable++] = offno;
+ }
+ else
+ {
+ /* we're keeping it, so count it */
+ if (num_index_tuples)
+ *num_index_tuples += 1;
+ }
+ }
+
+ /* retain the pin on primary bucket page till end of bucket scan */
+ if (blkno == bucket_blkno)
+ retain_pin = true;
+ else
+ retain_pin = false;
+
+ blkno = opaque->hasho_nextblkno;
+
+ /*
+ * Apply deletions, advance to next page and write page if needed.
+ */
+ if (ndeletable > 0)
+ {
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+ bucket_dirty = true;
+
+ /*
+			 * Mark the page as clean if vacuum removed DEAD tuples from the
+			 * index page; we do this by clearing the LH_PAGE_HAS_DEAD_TUPLES
+			 * flag.
+ */
+ if (tuples_removed && *tuples_removed > 0 &&
+ H_HAS_DEAD_TUPLES(opaque))
+ {
+ opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+ clear_dead_marking = true;
+ }
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_delete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.clear_dead_marking = clear_dead_marking;
+ xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
+
+ /*
+ * bucket buffer needs to be registered to ensure that we can
+ * acquire a cleanup lock on it during replay.
+ */
+ if (!xlrec.is_primary_bucket_page)
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
+
+ XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(1, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
+ PageSetLSN(BufferGetPage(buf), recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /* bail out if there are no more pages to scan. */
+ if (!BlockNumberIsValid(blkno))
+ break;
+
+ next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+ LH_OVERFLOW_PAGE,
+ bstrategy);
+
+ /*
+		 * Release the lock on the previous page only after acquiring the lock
+		 * on the next page, so a concurrent scan cannot overtake us.
+ */
+ if (retain_pin)
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, buf);
+
+ buf = next_buf;
+ }
+
+ /*
+	 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
+	 * If the current buffer is the same as the bucket buffer, then we already
+	 * hold a lock on the bucket page.
+ */
+ if (buf != bucket_buf)
+ {
+ _hash_relbuf(rel, buf);
+ LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
+ }
+
+ /*
+	 * Clear the garbage flag from the bucket after deleting the tuples that
+	 * were moved by the split. We purposely clear the flag before squeezing
+	 * the bucket, so that after a restart, vacuum won't again try to delete
+	 * the moved-by-split tuples.
+ */
+ if (split_cleanup)
+ {
+ HashPageOpaque bucket_opaque;
+ Page page;
+
+ page = BufferGetPage(bucket_buf);
+ bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+ MarkBufferDirty(bucket_buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+	 * If we have deleted anything, try to compact free space. To squeeze
+	 * the bucket we must have a cleanup lock; otherwise the squeeze could
+	 * change the ordering of tuples seen by a scan that started before it.
+ */
+ if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
+ _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
+ bstrategy);
+ else
+ LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
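
The lock chaining that hashbucketcleanup() performs on the bucket chain (take
the lock on the next page before giving up the lock on the previous one, so a
concurrent scan can never overtake the cleanup) is the classic hand-over-hand
pattern. Here is a standalone C sketch of the idea, editorial only, using a
linked list protected by per-node pthread mutexes rather than buffer locks:

#include <pthread.h>
#include <stdio.h>

typedef struct ToyPage
{
	pthread_mutex_t lock;
	int			pageno;
	struct ToyPage *next;
} ToyPage;

/* Visit every page in the chain while holding at most two locks at once. */
static void
walk_chain(ToyPage *page)
{
	pthread_mutex_lock(&page->lock);
	while (page != NULL)
	{
		ToyPage    *next = page->next;

		printf("cleaning page %d\n", page->pageno);
		if (next != NULL)
			pthread_mutex_lock(&next->lock);	/* lock next page first ... */
		pthread_mutex_unlock(&page->lock);		/* ... then release this one */
		page = next;
	}
}

int
main(void)
{
	ToyPage		c = {PTHREAD_MUTEX_INITIALIZER, 3, NULL};
	ToyPage		b = {PTHREAD_MUTEX_INITIALIZER, 2, &c};
	ToyPage		a = {PTHREAD_MUTEX_INITIALIZER, 1, &b};

	walk_chain(&a);
	return 0;
}
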
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
new file mode 100644
index 0000000000..0ea11b2e74
--- /dev/null
+++ b/src/backend/access/hash/hash_xlog.c
@@ -0,0 +1,1270 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash_xlog.c
+ * WAL replay logic for hash index.
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/hash/hash_xlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam_xlog.h"
+#include "access/bufmask.h"
+#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "access/xlogutils.h"
+#include "access/xlog.h"
+#include "access/transam.h"
+#include "storage/procarray.h"
+#include "miscadmin.h"
+
+/*
+ * replay a hash index meta page
+ */
+static void
+hash_xlog_init_meta_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Page page;
+ Buffer metabuf;
+
+ xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record);
+
+	/* create the index's metapage */
+ metabuf = XLogInitBufferForRedo(record, 0);
+ Assert(BufferIsValid(metabuf));
+ _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid,
+ xlrec->ffactor, true);
+ page = (Page) BufferGetPage(metabuf);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(metabuf);
+ /* all done */
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * replay a hash index bitmap page
+ */
+static void
+hash_xlog_init_bitmap_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer bitmapbuf;
+ Buffer metabuf;
+ Page page;
+ HashMetaPage metap;
+ uint32 num_buckets;
+
+ xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record);
+
+ /*
+ * Initialize bitmap page
+ */
+ bitmapbuf = XLogInitBufferForRedo(record, 0);
+ _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true);
+ PageSetLSN(BufferGetPage(bitmapbuf), lsn);
+ MarkBufferDirty(bitmapbuf);
+ UnlockReleaseBuffer(bitmapbuf);
+
+ /* add the new bitmap page to the metapage's list of bitmaps */
+ if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
+ {
+ /*
+ * Note: in normal operation, we'd update the metapage while still
+ * holding lock on the bitmap page. But during replay it's not
+ * necessary to hold that lock, since nobody can see it yet; the
+ * creating transaction hasn't yet committed.
+ */
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+
+ num_buckets = metap->hashm_maxbucket + 1;
+ metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
+ metap->hashm_nmaps++;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * replay a hash index insert without split
+ */
+static void
+hash_xlog_insert(XLogReaderState *record)
+{
+ HashMetaPage metap;
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Size datalen;
+ char *datapos = XLogRecGetBlockData(record, 0, &datalen);
+
+ page = BufferGetPage(buffer);
+
+ if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "hash_xlog_insert: failed to add item");
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ /*
+ * Note: in normal operation, we'd update the metapage while still
+ * holding lock on the page we inserted into. But during replay it's
+ * not necessary to hold that lock, since no other index updates can
+ * be happening concurrently.
+ */
+ page = BufferGetPage(buffer);
+ metap = HashPageGetMeta(page);
+ metap->hashm_ntuples += 1;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * replay addition of overflow page for hash index
+ */
+static void
+hash_xlog_add_ovfl_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record);
+ Buffer leftbuf;
+ Buffer ovflbuf;
+ Buffer metabuf;
+ BlockNumber leftblk;
+ BlockNumber rightblk;
+ BlockNumber newmapblk = InvalidBlockNumber;
+ Page ovflpage;
+ HashPageOpaque ovflopaque;
+ uint32 *num_bucket;
+ char *data;
+ Size datalen PG_USED_FOR_ASSERTS_ONLY;
+ bool new_bmpage = false;
+
+ XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
+ XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);
+
+ ovflbuf = XLogInitBufferForRedo(record, 0);
+ Assert(BufferIsValid(ovflbuf));
+
+ data = XLogRecGetBlockData(record, 0, &datalen);
+ num_bucket = (uint32 *) data;
+ Assert(datalen == sizeof(uint32));
+ _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE,
+ true);
+ /* update backlink */
+ ovflpage = BufferGetPage(ovflbuf);
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ ovflopaque->hasho_prevblkno = leftblk;
+
+ PageSetLSN(ovflpage, lsn);
+ MarkBufferDirty(ovflbuf);
+
+ if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
+ {
+ Page leftpage;
+ HashPageOpaque leftopaque;
+
+ leftpage = BufferGetPage(leftbuf);
+ leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage);
+ leftopaque->hasho_nextblkno = rightblk;
+
+ PageSetLSN(leftpage, lsn);
+ MarkBufferDirty(leftbuf);
+ }
+
+ if (BufferIsValid(leftbuf))
+ UnlockReleaseBuffer(leftbuf);
+ UnlockReleaseBuffer(ovflbuf);
+
+ /*
+ * Note: in normal operation, we'd update the bitmap and meta page while
+ * still holding lock on the overflow pages. But during replay it's not
+ * necessary to hold those locks, since no other index updates can be
+ * happening concurrently.
+ */
+ if (XLogRecHasBlockRef(record, 2))
+ {
+ Buffer mapbuffer;
+
+ if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO)
+ {
+ Page mappage = (Page) BufferGetPage(mapbuffer);
+ uint32 *freep = NULL;
+ char *data;
+ uint32 *bitmap_page_bit;
+
+ freep = HashPageGetBitmap(mappage);
+
+ data = XLogRecGetBlockData(record, 2, &datalen);
+ bitmap_page_bit = (uint32 *) data;
+
+ SETBIT(freep, *bitmap_page_bit);
+
+ PageSetLSN(mappage, lsn);
+ MarkBufferDirty(mapbuffer);
+ }
+ if (BufferIsValid(mapbuffer))
+ UnlockReleaseBuffer(mapbuffer);
+ }
+
+ if (XLogRecHasBlockRef(record, 3))
+ {
+ Buffer newmapbuf;
+
+ newmapbuf = XLogInitBufferForRedo(record, 3);
+
+ _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true);
+
+ new_bmpage = true;
+ newmapblk = BufferGetBlockNumber(newmapbuf);
+
+ MarkBufferDirty(newmapbuf);
+ PageSetLSN(BufferGetPage(newmapbuf), lsn);
+
+ UnlockReleaseBuffer(newmapbuf);
+ }
+
+ if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO)
+ {
+ HashMetaPage metap;
+ Page page;
+ uint32 *firstfree_ovflpage;
+
+ data = XLogRecGetBlockData(record, 4, &datalen);
+ firstfree_ovflpage = (uint32 *) data;
+
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+ metap->hashm_firstfree = *firstfree_ovflpage;
+
+ if (!xlrec->bmpage_found)
+ {
+ metap->hashm_spares[metap->hashm_ovflpoint]++;
+
+ if (new_bmpage)
+ {
+ Assert(BlockNumberIsValid(newmapblk));
+
+ metap->hashm_mapp[metap->hashm_nmaps] = newmapblk;
+ metap->hashm_nmaps++;
+ metap->hashm_spares[metap->hashm_ovflpoint]++;
+ }
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * replay allocation of page for split operation
+ */
+static void
+hash_xlog_split_allocate_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record);
+ Buffer oldbuf;
+ Buffer newbuf;
+ Buffer metabuf;
+ Size datalen PG_USED_FOR_ASSERTS_ONLY;
+ char *data;
+ XLogRedoAction action;
+
+ /*
+ * To be consistent with normal operation, here we take cleanup locks on
+ * both the old and new buckets even though there can't be any concurrent
+ * inserts.
+ */
+
+ /* replay the record for old bucket */
+ action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);
+
+ /*
+ * Note that we still update the page even if it was restored from a full
+ * page image, because the special space is not included in the image.
+ */
+ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
+ {
+ Page oldpage;
+ HashPageOpaque oldopaque;
+
+ oldpage = BufferGetPage(oldbuf);
+ oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);
+
+ oldopaque->hasho_flag = xlrec->old_bucket_flag;
+ oldopaque->hasho_prevblkno = xlrec->new_bucket;
+
+ PageSetLSN(oldpage, lsn);
+ MarkBufferDirty(oldbuf);
+ }
+
+ /* replay the record for new bucket */
+ newbuf = XLogInitBufferForRedo(record, 1);
+ _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket,
+ xlrec->new_bucket_flag, true);
+ if (!IsBufferCleanupOK(newbuf))
+ elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
+ MarkBufferDirty(newbuf);
+ PageSetLSN(BufferGetPage(newbuf), lsn);
+
+ /*
+	 * We could release the lock on the old bucket early, but we do it here to
+	 * be consistent with normal operation.
+ */
+ if (BufferIsValid(oldbuf))
+ UnlockReleaseBuffer(oldbuf);
+ if (BufferIsValid(newbuf))
+ UnlockReleaseBuffer(newbuf);
+
+ /*
+ * Note: in normal operation, we'd update the meta page while still
+ * holding lock on the old and new bucket pages. But during replay it's
+ * not necessary to hold those locks, since no other bucket splits can be
+ * happening concurrently.
+ */
+
+ /* replay the record for metapage changes */
+ if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO)
+ {
+ Page page;
+ HashMetaPage metap;
+
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+ metap->hashm_maxbucket = xlrec->new_bucket;
+
+ data = XLogRecGetBlockData(record, 2, &datalen);
+
+ if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS)
+ {
+ uint32 lowmask;
+ uint32 *highmask;
+
+ /* extract low and high masks. */
+ memcpy(&lowmask, data, sizeof(uint32));
+ highmask = (uint32 *) ((char *) data + sizeof(uint32));
+
+ /* update metapage */
+ metap->hashm_lowmask = lowmask;
+ metap->hashm_highmask = *highmask;
+
+ data += sizeof(uint32) * 2;
+ }
+
+ if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT)
+ {
+ uint32 ovflpoint;
+ uint32 *ovflpages;
+
+ /* extract information of overflow pages. */
+ memcpy(&ovflpoint, data, sizeof(uint32));
+ ovflpages = (uint32 *) ((char *) data + sizeof(uint32));
+
+ /* update metapage */
+ metap->hashm_spares[ovflpoint] = *ovflpages;
+ metap->hashm_ovflpoint = ovflpoint;
+ }
+
+ MarkBufferDirty(metabuf);
+ PageSetLSN(BufferGetPage(metabuf), lsn);
+ }
+
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * replay of split operation
+ */
+static void
+hash_xlog_split_page(XLogReaderState *record)
+{
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
+ elog(ERROR, "Hash split record did not contain a full-page image");
+
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * replay completion of split operation
+ */
+static void
+hash_xlog_split_complete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record);
+ Buffer oldbuf;
+ Buffer newbuf;
+ XLogRedoAction action;
+
+ /* replay the record for old bucket */
+ action = XLogReadBufferForRedo(record, 0, &oldbuf);
+
+ /*
+ * Note that we still update the page even if it was restored from a full
+ * page image, because the bucket flag is not included in the image.
+ */
+ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
+ {
+ Page oldpage;
+ HashPageOpaque oldopaque;
+
+ oldpage = BufferGetPage(oldbuf);
+ oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);
+
+ oldopaque->hasho_flag = xlrec->old_bucket_flag;
+
+ PageSetLSN(oldpage, lsn);
+ MarkBufferDirty(oldbuf);
+ }
+ if (BufferIsValid(oldbuf))
+ UnlockReleaseBuffer(oldbuf);
+
+ /* replay the record for new bucket */
+ action = XLogReadBufferForRedo(record, 1, &newbuf);
+
+ /*
+ * Note that we still update the page even if it was restored from a full
+ * page image, because the bucket flag is not included in the image.
+ */
+ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
+ {
+ Page newpage;
+ HashPageOpaque nopaque;
+
+ newpage = BufferGetPage(newbuf);
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->hasho_flag = xlrec->new_bucket_flag;
+
+ PageSetLSN(newpage, lsn);
+ MarkBufferDirty(newbuf);
+ }
+ if (BufferIsValid(newbuf))
+ UnlockReleaseBuffer(newbuf);
+}
+
+/*
+ * replay move of page contents for squeeze operation of hash index
+ */
+static void
+hash_xlog_move_page_contents(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
+ Buffer bucketbuf = InvalidBuffer;
+ Buffer writebuf = InvalidBuffer;
+ Buffer deletebuf = InvalidBuffer;
+ XLogRedoAction action;
+
+ /*
+ * Ensure we have a cleanup lock on primary bucket page before we start
+	 * with the actual replay operation. This is to ensure that no scan can
+	 * start, and no scan can already be in progress, during the replay of
+	 * this operation. If we allowed scans during this operation, they could
+	 * miss some records or see the same record multiple times.
+ */
+ if (xldata->is_prim_bucket_same_wrt)
+ action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
+ else
+ {
+ /*
+		 * We don't care about the return value, as the only purpose of reading
+		 * bucketbuf is to take a cleanup lock on the primary bucket page.
+ */
+ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
+
+ action = XLogReadBufferForRedo(record, 1, &writebuf);
+ }
+
+ /* replay the record for adding entries in overflow buffer */
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page writepage;
+ char *begin;
+ char *data;
+ Size datalen;
+ uint16 ninserted = 0;
+
+ data = begin = XLogRecGetBlockData(record, 1, &datalen);
+
+ writepage = (Page) BufferGetPage(writebuf);
+
+ if (xldata->ntups > 0)
+ {
+ OffsetNumber *towrite = (OffsetNumber *) data;
+
+ data += sizeof(OffsetNumber) * xldata->ntups;
+
+ while (data - begin < datalen)
+ {
+ IndexTuple itup = (IndexTuple) data;
+ Size itemsz;
+ OffsetNumber l;
+
+ itemsz = IndexTupleDSize(*itup);
+ itemsz = MAXALIGN(itemsz);
+
+ data += itemsz;
+
+ l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes",
+ (int) itemsz);
+
+ ninserted++;
+ }
+ }
+
+ /*
+		 * The number of tuples inserted must be the same as requested in the
+		 * REDO record.
+ */
+ Assert(ninserted == xldata->ntups);
+
+ PageSetLSN(writepage, lsn);
+ MarkBufferDirty(writebuf);
+ }
+
+ /* replay the record for deleting entries from overflow buffer */
+ if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO)
+ {
+ Page page;
+ char *ptr;
+ Size len;
+
+ ptr = XLogRecGetBlockData(record, 2, &len);
+
+ page = (Page) BufferGetPage(deletebuf);
+
+ if (len > 0)
+ {
+ OffsetNumber *unused;
+ OffsetNumber *unend;
+
+ unused = (OffsetNumber *) ptr;
+ unend = (OffsetNumber *) ((char *) ptr + len);
+
+ if ((unend - unused) > 0)
+ PageIndexMultiDelete(page, unused, unend - unused);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(deletebuf);
+ }
+
+ /*
+	 * Replay is complete, so now we can release the buffers. We release the
+	 * locks at the end of the replay operation to ensure that we hold the
+	 * lock on the primary bucket page until the end of the operation. We
+	 * could release the lock on the write buffer as soon as its part of the
+	 * operation is complete (when it is not the same as the primary bucket
+	 * page), but that doesn't seem worth complicating the code.
+ */
+ if (BufferIsValid(deletebuf))
+ UnlockReleaseBuffer(deletebuf);
+
+ if (BufferIsValid(writebuf))
+ UnlockReleaseBuffer(writebuf);
+
+ if (BufferIsValid(bucketbuf))
+ UnlockReleaseBuffer(bucketbuf);
+}
+
+/*
+ * replay squeeze page operation of hash index
+ */
+static void
+hash_xlog_squeeze_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
+ Buffer bucketbuf = InvalidBuffer;
+ Buffer writebuf;
+ Buffer ovflbuf;
+ Buffer prevbuf = InvalidBuffer;
+ Buffer mapbuf;
+ XLogRedoAction action;
+
+ /*
+ * Ensure we have a cleanup lock on primary bucket page before we start
+	 * with the actual replay operation. This is to ensure that no scan can
+	 * start, and no scan can already be in progress, during the replay of
+	 * this operation. If we allowed scans during this operation, they could
+	 * miss some records or see the same record multiple times.
+ */
+ if (xldata->is_prim_bucket_same_wrt)
+ action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
+ else
+ {
+ /*
+		 * We don't care about the return value, as the only purpose of reading
+		 * bucketbuf is to take a cleanup lock on the primary bucket page.
+ */
+ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
+
+ action = XLogReadBufferForRedo(record, 1, &writebuf);
+ }
+
+ /* replay the record for adding entries in overflow buffer */
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page writepage;
+ char *begin;
+ char *data;
+ Size datalen;
+ uint16 ninserted = 0;
+
+ data = begin = XLogRecGetBlockData(record, 1, &datalen);
+
+ writepage = (Page) BufferGetPage(writebuf);
+
+ if (xldata->ntups > 0)
+ {
+ OffsetNumber *towrite = (OffsetNumber *) data;
+
+ data += sizeof(OffsetNumber) * xldata->ntups;
+
+ while (data - begin < datalen)
+ {
+ IndexTuple itup = (IndexTuple) data;
+ Size itemsz;
+ OffsetNumber l;
+
+ itemsz = IndexTupleDSize(*itup);
+ itemsz = MAXALIGN(itemsz);
+
+ data += itemsz;
+
+ l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes",
+ (int) itemsz);
+
+ ninserted++;
+ }
+ }
+
+ /*
+		 * The number of tuples inserted must be the same as requested in the
+		 * REDO record.
+ */
+ Assert(ninserted == xldata->ntups);
+
+ /*
+		 * If the page to which we are adding tuples is the page previous to
+		 * the freed overflow page, then update its nextblkno.
+ */
+ if (xldata->is_prev_bucket_same_wrt)
+ {
+ HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage);
+
+ writeopaque->hasho_nextblkno = xldata->nextblkno;
+ }
+
+ PageSetLSN(writepage, lsn);
+ MarkBufferDirty(writebuf);
+ }
+
+ /* replay the record for initializing overflow buffer */
+ if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO)
+ {
+ Page ovflpage;
+ HashPageOpaque ovflopaque;
+
+ ovflpage = BufferGetPage(ovflbuf);
+
+ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+
+ ovflopaque->hasho_prevblkno = InvalidBlockNumber;
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_bucket = -1;
+ ovflopaque->hasho_flag = LH_UNUSED_PAGE;
+ ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+ PageSetLSN(ovflpage, lsn);
+ MarkBufferDirty(ovflbuf);
+ }
+ if (BufferIsValid(ovflbuf))
+ UnlockReleaseBuffer(ovflbuf);
+
+ /* replay the record for page previous to the freed overflow page */
+ if (!xldata->is_prev_bucket_same_wrt &&
+ XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO)
+ {
+ Page prevpage = BufferGetPage(prevbuf);
+ HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+ prevopaque->hasho_nextblkno = xldata->nextblkno;
+
+ PageSetLSN(prevpage, lsn);
+ MarkBufferDirty(prevbuf);
+ }
+ if (BufferIsValid(prevbuf))
+ UnlockReleaseBuffer(prevbuf);
+
+ /* replay the record for page next to the freed overflow page */
+ if (XLogRecHasBlockRef(record, 4))
+ {
+ Buffer nextbuf;
+
+ if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO)
+ {
+ Page nextpage = BufferGetPage(nextbuf);
+ HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
+
+ nextopaque->hasho_prevblkno = xldata->prevblkno;
+
+ PageSetLSN(nextpage, lsn);
+ MarkBufferDirty(nextbuf);
+ }
+ if (BufferIsValid(nextbuf))
+ UnlockReleaseBuffer(nextbuf);
+ }
+
+ if (BufferIsValid(writebuf))
+ UnlockReleaseBuffer(writebuf);
+
+ if (BufferIsValid(bucketbuf))
+ UnlockReleaseBuffer(bucketbuf);
+
+ /*
+ * Note: in normal operation, we'd update the bitmap and meta page while
+ * still holding lock on the primary bucket page and overflow pages. But
+ * during replay it's not necessary to hold those locks, since no other
+ * index updates can be happening concurrently.
+ */
+ /* replay the record for bitmap page */
+ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO)
+ {
+ Page mappage = (Page) BufferGetPage(mapbuf);
+ uint32 *freep = NULL;
+ char *data;
+ uint32 *bitmap_page_bit;
+ Size datalen;
+
+ freep = HashPageGetBitmap(mappage);
+
+ data = XLogRecGetBlockData(record, 5, &datalen);
+ bitmap_page_bit = (uint32 *) data;
+
+ CLRBIT(freep, *bitmap_page_bit);
+
+ PageSetLSN(mappage, lsn);
+ MarkBufferDirty(mapbuf);
+ }
+ if (BufferIsValid(mapbuf))
+ UnlockReleaseBuffer(mapbuf);
+
+ /* replay the record for meta page */
+ if (XLogRecHasBlockRef(record, 6))
+ {
+ Buffer metabuf;
+
+ if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO)
+ {
+ HashMetaPage metap;
+ Page page;
+ char *data;
+ uint32 *firstfree_ovflpage;
+ Size datalen;
+
+ data = XLogRecGetBlockData(record, 6, &datalen);
+ firstfree_ovflpage = (uint32 *) data;
+
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+ metap->hashm_firstfree = *firstfree_ovflpage;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+ }
+}
+
+/*
+ * replay delete operation of hash index
+ */
+static void
+hash_xlog_delete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
+ Buffer bucketbuf = InvalidBuffer;
+ Buffer deletebuf;
+ Page page;
+ XLogRedoAction action;
+
+ /*
+ * Ensure we have a cleanup lock on primary bucket page before we start
+	 * with the actual replay operation. This is to ensure that no scan can
+	 * start, and no scan can already be in progress, during the replay of
+	 * this operation. If we allowed scans during this operation, they could
+	 * miss some records or see the same record multiple times.
+ */
+ if (xldata->is_primary_bucket_page)
+ action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
+ else
+ {
+ /*
+		 * We don't care about the return value, as the only purpose of reading
+		 * bucketbuf is to take a cleanup lock on the primary bucket page.
+ */
+ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
+
+ action = XLogReadBufferForRedo(record, 1, &deletebuf);
+ }
+
+ /* replay the record for deleting entries in bucket page */
+ if (action == BLK_NEEDS_REDO)
+ {
+ char *ptr;
+ Size len;
+
+ ptr = XLogRecGetBlockData(record, 1, &len);
+
+ page = (Page) BufferGetPage(deletebuf);
+
+ if (len > 0)
+ {
+ OffsetNumber *unused;
+ OffsetNumber *unend;
+
+ unused = (OffsetNumber *) ptr;
+ unend = (OffsetNumber *) ((char *) ptr + len);
+
+ if ((unend - unused) > 0)
+ PageIndexMultiDelete(page, unused, unend - unused);
+ }
+
+ /*
+ * Mark the page as not containing any LP_DEAD items only if
+ * clear_dead_marking flag is set to true. See comments in
+ * hashbucketcleanup() for details.
+ */
+ if (xldata->clear_dead_marking)
+ {
+ HashPageOpaque pageopaque;
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(deletebuf);
+ }
+ if (BufferIsValid(deletebuf))
+ UnlockReleaseBuffer(deletebuf);
+
+ if (BufferIsValid(bucketbuf))
+ UnlockReleaseBuffer(bucketbuf);
+}
+
+/*
+ * replay split cleanup flag operation for primary bucket page.
+ */
+static void
+hash_xlog_split_cleanup(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ HashPageOpaque bucket_opaque;
+
+ page = (Page) BufferGetPage(buffer);
+
+ bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * replay for update meta page
+ */
+static void
+hash_xlog_update_meta_page(XLogReaderState *record)
+{
+ HashMetaPage metap;
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record);
+ Buffer metabuf;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+
+ metap->hashm_ntuples = xldata->ntuples;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record)
+{
+ xl_hash_vacuum_one_page *xlrec;
+ OffsetNumber *unused;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * Check if WAL replay has reached a consistent database state. If not, we
+ * must PANIC. See the definition of
+ * btree_xlog_delete_get_latestRemovedXid for more details.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, HASH_READ);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId from
+ * the heap items they point to.
+ */
+ unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
+
+ for (i = 0; i < xlrec->ntuples; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted.
+ */
+ iitemid = PageGetItemId(ipage, unused[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM,
+ hblkno, RBM_NORMAL);
+
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, HASH_READ);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+			 * Conjecture: if hitemid is dead then it had xids before the xids
+			 * marked on LP_NORMAL items. So we just ignore this item and move
+			 * on to the next, for the purposes of calculating
+			 * latestRemovedXid.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
+/*
+ * replay delete operation in hash index to remove
+ * tuples marked as DEAD during index tuple insertion.
+ */
+static void
+hash_xlog_vacuum_one_page(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_hash_vacuum_one_page *xldata;
+ Buffer buffer;
+ Buffer metabuf;
+ Page page;
+ XLogRedoAction action;
+ HashPageOpaque pageopaque;
+
+ xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * Hash index records that are marked as LP_DEAD and being removed during
+ * hash index tuple insertion can conflict with standby queries. You might
+ * think that vacuum records would conflict as well, but we've handled
+ * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual hash index vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ TransactionId latestRemovedXid =
+ hash_xlog_vacuum_get_latestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
+ action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);
+
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage)
+ {
+ OffsetNumber *unused;
+
+ unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage);
+
+ PageIndexMultiDelete(page, unused, xldata->ntuples);
+ }
+
+ /*
+ * Mark the page as not containing any LP_DEAD items. See comments in
+ * _hash_vacuum_one_page() for details.
+ */
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
+ {
+ Page metapage;
+ HashMetaPage metap;
+
+ metapage = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(metapage);
+
+ metap->hashm_ntuples -= xldata->ntuples;
+
+ PageSetLSN(metapage, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+}
+
+void
+hash_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_HASH_INIT_META_PAGE:
+ hash_xlog_init_meta_page(record);
+ break;
+ case XLOG_HASH_INIT_BITMAP_PAGE:
+ hash_xlog_init_bitmap_page(record);
+ break;
+ case XLOG_HASH_INSERT:
+ hash_xlog_insert(record);
+ break;
+ case XLOG_HASH_ADD_OVFL_PAGE:
+ hash_xlog_add_ovfl_page(record);
+ break;
+ case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
+ hash_xlog_split_allocate_page(record);
+ break;
+ case XLOG_HASH_SPLIT_PAGE:
+ hash_xlog_split_page(record);
+ break;
+ case XLOG_HASH_SPLIT_COMPLETE:
+ hash_xlog_split_complete(record);
+ break;
+ case XLOG_HASH_MOVE_PAGE_CONTENTS:
+ hash_xlog_move_page_contents(record);
+ break;
+ case XLOG_HASH_SQUEEZE_PAGE:
+ hash_xlog_squeeze_page(record);
+ break;
+ case XLOG_HASH_DELETE:
+ hash_xlog_delete(record);
+ break;
+ case XLOG_HASH_SPLIT_CLEANUP:
+ hash_xlog_split_cleanup(record);
+ break;
+ case XLOG_HASH_UPDATE_META_PAGE:
+ hash_xlog_update_meta_page(record);
+ break;
+ case XLOG_HASH_VACUUM_ONE_PAGE:
+ hash_xlog_vacuum_one_page(record);
+ break;
+ default:
+ elog(PANIC, "hash_redo: unknown op code %u", info);
+ }
+}
+
+/*
+ * Mask a hash page before performing consistency checks on it.
+ */
+void
+hash_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ HashPageOpaque opaque;
+ int pagetype;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
+ if (pagetype == LH_UNUSED_PAGE)
+ {
+ /*
+ * Mask everything on an UNUSED page.
+ */
+ mask_page_content(page);
+ }
+ else if (pagetype == LH_BUCKET_PAGE ||
+ pagetype == LH_OVERFLOW_PAGE)
+ {
+ /*
+ * In hash bucket and overflow pages, it is possible to modify the
+ * LP_FLAGS without emitting any WAL record. Hence, mask the line
+ * pointer flags. See hashgettuple(), _hash_kill_items() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain
+ * unlogged. So, mask it. See _hash_kill_items() for details.
+ */
+ opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+}
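+
+/*
+ * Usage note: hash_mask() is invoked when WAL consistency checking is
+ * enabled for the hash resource manager. Assuming a build that exposes the
+ * wal_consistency_checking GUC (stock PostgreSQL 10 and later), setting
+ *
+ *     wal_consistency_checking = 'hash'
+ *
+ * in postgresql.conf makes redo mask each replayed hash page with this
+ * routine and compare it against the masked full-page image carried in the
+ * WAL record.
+ */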
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index bb9adad82e..4089fd6d8a 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -3,7 +3,7 @@
* hashfunc.c
* Support functions for hash access method.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -27,6 +27,7 @@
#include "postgres.h"
#include "access/hash.h"
+#include "utils/builtins.h"
#ifdef PGXC
#include "catalog/pg_type.h"
@@ -36,6 +37,16 @@
#include "utils/nabstime.h"
#endif
+/*
+ * Datatype-specific hash functions.
+ *
+ * These support both hash indexes and hash joins.
+ *
+ * NOTE: some of these are also used by catcache operations, without
+ * any direct connection to hash indexes. The common hash_any routine is
+ * also used by dynahash tables.
+ */
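+
+/*
+ * For instance, a hash index on a text column and a hash join on text keys
+ * both end up in hashtext(), which in turn feeds the key bytes to
+ * hash_any().
+ */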
+
/* Note: this is used for both "char" and boolean datatypes */
Datum
hashchar(PG_FUNCTION_ARGS)
@@ -138,22 +149,11 @@ hashoidvector(PG_FUNCTION_ARGS)
}
Datum
-hashint2vector(PG_FUNCTION_ARGS)
-{
- int2vector *key = (int2vector *) PG_GETARG_POINTER(0);
-
- return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int16));
-}
-
-Datum
hashname(PG_FUNCTION_ARGS)
{
char *key = NameStr(*PG_GETARG_NAME(0));
- int keylen = strlen(key);
-
- Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */
- return hash_any((unsigned char *) key, keylen);
+ return hash_any((unsigned char *) key, strlen(key));
}
Datum
@@ -581,8 +581,6 @@ compute_hash(Oid type, Datum value, char locator)
return DirectFunctionCall1(hashchar, value);
case NAMEOID:
return DirectFunctionCall1(hashname, value);
- case INT2VECTOROID:
- return DirectFunctionCall1(hashint2vector, value);
case VARCHAROID:
case TEXTOID:
@@ -677,8 +675,6 @@ get_compute_hash_function(Oid type, char locator)
return "hashchar";
case NAMEOID:
return "hashname";
- case INT2VECTOROID:
- return "hashint2vector";
case VARCHAROID:
case TEXTOID:
return "hashtext";
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index acd2e64763..01c8d8006c 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -3,7 +3,7 @@
* hashinsert.c
* Item insertion in hash tables for Postgres.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -16,8 +16,15 @@
#include "postgres.h"
#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "access/heapam.h"
+#include "miscadmin.h"
#include "utils/rel.h"
+#include "storage/lwlock.h"
+#include "storage/buf_internals.h"
+static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
+ RelFileNode hnode);
/*
* _hash_doinsert() -- Handle insertion of a single index tuple.
@@ -26,20 +33,21 @@
* and hashinsert. By here, itup is completely filled in.
*/
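+/*
+ * Note: 'heapRel' is the heap relation this index belongs to; its
+ * relfilenode is passed down to _hash_vacuum_one_page(), which records it
+ * in the WAL so that replay on a standby can resolve recovery conflicts
+ * against the correct heap relation.
+ */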
void
-_hash_doinsert(Relation rel, IndexTuple itup)
+_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
- Buffer buf;
+ Buffer buf = InvalidBuffer;
+ Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
- BlockNumber blkno;
- BlockNumber oldblkno = InvalidBlockNumber;
- bool retry = false;
+ HashMetaPage usedmetap = NULL;
+ Page metapage;
Page page;
HashPageOpaque pageopaque;
Size itemsz;
bool do_expand;
uint32 hashkey;
Bucket bucket;
+ OffsetNumber itup_off;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
@@ -51,9 +59,15 @@ _hash_doinsert(Relation rel, IndexTuple itup)
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- metap = HashPageGetMeta(BufferGetPage(metabuf));
+restart_insert:
+
+ /*
+ * Read the metapage. We don't lock it yet; HashMaxItemSize() will
+ * examine pd_pagesize_version, but that can't change so we can examine it
+ * without a lock.
+ */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+ metapage = BufferGetPage(metabuf);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@@ -62,74 +76,90 @@ _hash_doinsert(Relation rel, IndexTuple itup)
*
* XXX this is useless code if we are only storing hash keys.
*/
- if (itemsz > HashMaxItemSize((Page) metap))
+ if (itemsz > HashMaxItemSize(metapage))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %zu exceeds hash maximum %zu",
- itemsz, HashMaxItemSize((Page) metap)),
+ itemsz, HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
+ /* Lock the primary bucket page for the target bucket. */
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
+ &usedmetap);
+ Assert(usedmetap != NULL);
+
+ /* remember the primary bucket buffer to release the pin on it at end. */
+ bucket_buf = buf;
+
+ page = BufferGetPage(buf);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ bucket = pageopaque->hasho_bucket;
+
/*
- * Loop until we get a lock on the correct target bucket.
+ * If this bucket is in the process of being split, try to finish the
+ * split before inserting, because that might create room for the
+ * insertion to proceed without allocating an additional overflow page.
+ * It's only interesting to finish the split if we're trying to insert
+ * into the bucket from which we're removing tuples (the "old" bucket),
+ * not if we're trying to insert into the bucket into which tuples are
+ * being moved (the "new" bucket).
*/
- for (;;)
+ if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
{
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
+ /* release the lock on bucket buffer, before completing the split. */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ _hash_finish_split(rel, metabuf, buf, bucket,
+ usedmetap->hashm_maxbucket,
+ usedmetap->hashm_highmask,
+ usedmetap->hashm_lowmask);
+
+ /* release the pin on old and meta buffer. retry for insert. */
+ _hash_dropbuf(rel, buf);
+ _hash_dropbuf(rel, metabuf);
+ goto restart_insert;
+ }
- /* Release metapage lock, but keep pin. */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ /* Do the insertion */
+ while (PageGetFreeSpace(page) < itemsz)
+ {
+ BlockNumber nextblkno;
/*
- * If the previous iteration of this loop locked what is still the
- * correct target bucket, we are done. Otherwise, drop any old lock
- * and lock what now appears to be the correct bucket.
+ * Check if the current page has any LP_DEAD tuples. If yes, delete
+ * these tuples and see if we can get space for the new item to be
+ * inserted before moving to the next page in the bucket chain.
*/
- if (retry)
+ if (H_HAS_DEAD_TUPLES(pageopaque))
{
- if (oldblkno == blkno)
- break;
- _hash_droplock(rel, oldblkno, HASH_SHARE);
- }
- _hash_getlock(rel, blkno, HASH_SHARE);
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
- oldblkno = blkno;
- retry = true;
- }
+ if (IsBufferCleanupOK(buf))
+ {
+ _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);
- /* Fetch the primary bucket page for the bucket */
- buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
- page = BufferGetPage(buf);
- pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(pageopaque->hasho_bucket == bucket);
+ if (PageGetFreeSpace(page) >= itemsz)
+ break; /* OK, now we have enough space */
+ }
+ }
- /* Do the insertion */
- while (PageGetFreeSpace(page) < itemsz)
- {
/*
* no space on this page; check for an overflow page
*/
- BlockNumber nextblkno = pageopaque->hasho_nextblkno;
+ nextblkno = pageopaque->hasho_nextblkno;
if (BlockNumberIsValid(nextblkno))
{
/*
* ovfl page exists; go get it. if it doesn't have room, we'll
- * find out next pass through the loop test above.
+ * find out next pass through the loop test above. we always
+ * release both the lock and pin if this is an overflow page, but
+ * only the lock if this is the primary bucket page, since the pin
+ * on the primary bucket must be retained throughout the scan.
*/
- _hash_relbuf(rel, buf);
+ if (buf != bucket_buf)
+ _hash_relbuf(rel, buf);
+ else
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
}
@@ -141,43 +171,77 @@ _hash_doinsert(Relation rel, IndexTuple itup)
*/
/* release our write lock without modifying buffer */
- _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
/* chain to a new overflow page */
- buf = _hash_addovflpage(rel, metabuf, buf);
+ buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
page = BufferGetPage(buf);
/* should fit now, given test above */
Assert(PageGetFreeSpace(page) >= itemsz);
}
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
+ Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
Assert(pageopaque->hasho_bucket == bucket);
}
- /* found page with enough space, so add the item here */
- (void) _hash_pgaddtup(rel, buf, itemsz, itup);
-
- /* write and release the modified page */
- _hash_wrtbuf(rel, buf);
-
- /* We can drop the bucket lock now */
- _hash_droplock(rel, blkno, HASH_SHARE);
-
/*
* Write-lock the metapage so we can increment the tuple count. After
* incrementing it, check to see if it's time for a split.
*/
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /* found page with enough space, so add the item here */
+ itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
+ MarkBufferDirty(buf);
+
+ /* metapage operations */
+ metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
do_expand = metap->hashm_ntuples >
(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
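+
+ /*
+ * For example (hypothetical numbers): with hashm_ffactor = 75 and
+ * hashm_maxbucket = 3 (i.e. four buckets), do_expand becomes true once
+ * the index holds more than 75 * 4 = 300 tuples.
+ */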
- /* Write out the metapage and drop lock, but keep pin */
- _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_insert xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.offnum = itup_off;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
+
+ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
+
+ PageSetLSN(BufferGetPage(buf), recptr);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* drop lock on metapage, but keep pin */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Release the modified page and ensure to release the pin on primary
+ * page.
+ */
+ _hash_relbuf(rel, buf);
+ if (buf != bucket_buf)
+ _hash_dropbuf(rel, bucket_buf);
/* Attempt to split if a split is needed */
if (do_expand)
@@ -219,3 +283,143 @@ _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
return itup_off;
}
+
+/*
+ * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
+ * index.
+ *
+ * This routine has the same requirements for locking and tuple ordering as
+ * _hash_pgaddtup().
+ *
+ * The offset numbers at which the tuples were inserted are returned in the
+ * caller-supplied itup_offsets array.
+ */
+void
+_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
+ OffsetNumber *itup_offsets, uint16 nitups)
+{
+ OffsetNumber itup_off;
+ Page page;
+ uint32 hashkey;
+ int i;
+
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ page = BufferGetPage(buf);
+
+ for (i = 0; i < nitups; i++)
+ {
+ Size itemsize;
+
+ itemsize = IndexTupleDSize(*itups[i]);
+ itemsize = MAXALIGN(itemsize);
+
+ /* Find where to insert the tuple (preserving page's hashkey ordering) */
+ hashkey = _hash_get_indextuple_hashkey(itups[i]);
+ itup_off = _hash_binsearch(page, hashkey);
+
+ itup_offsets[i] = itup_off;
+
+ if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
+ == InvalidOffsetNumber)
+ elog(ERROR, "failed to add index item to \"%s\"",
+ RelationGetRelationName(rel));
+ }
+}
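+
+/*
+ * Usage note: callers gather the tuples (and their sizes) first and then
+ * call this routine inside a critical section, so that the additions can be
+ * logged together with the rest of the page operation in a single WAL
+ * record; see _hash_freeovflpage() and _hash_squeezebucket() in hashovfl.c.
+ */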
+
+/*
+ * _hash_vacuum_one_page - vacuum just one index page.
+ *
+ * Try to remove LP_DEAD items from the given page. The caller must have
+ * acquired a cleanup lock on the page being modified before calling this
+ * function.
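+ *
+ * The WAL record emitted here (XLOG_HASH_VACUUM_ONE_PAGE) is replayed by
+ * hash_xlog_vacuum_one_page() in hash_xlog.c.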
+ */
+
+static void
+_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
+ RelFileNode hnode)
+{
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ HashPageOpaque pageopaque;
+ HashMetaPage metap;
+
+ /* Scan each tuple in page to see if it is marked as LP_DEAD */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ /*
+ * Write-lock the meta page so that we can decrement tuple count.
+ */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * Mark the page as not containing any LP_DEAD items. This is not
+ * certainly true (there might be some that have recently been marked,
+ * but weren't included in our target-item list), but it will almost
+ * always be true and it doesn't seem worth an additional page scan to
+ * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
+ * anyway.
+ */
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+ metap->hashm_ntuples -= ndeletable;
+
+ MarkBufferDirty(buf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_vacuum_one_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.hnode = hnode;
+ xlrec.ntuples = ndeletable;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
+
+ /*
+ * We need the target-offsets array whether or not we store the
+ * whole buffer, to allow us to find the latestRemovedXid on a
+ * standby server.
+ */
+ XLogRegisterData((char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
+
+ PageSetLSN(BufferGetPage(buf), recptr);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /*
+ * Releasing write lock on meta page as we have updated the tuple
+ * count.
+ */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ }
+}
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index db3e268a76..b5133e3945 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -3,7 +3,7 @@
* hashovfl.c
* Overflow page management code for the Postgres hash access method
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -18,10 +18,11 @@
#include "postgres.h"
#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "miscadmin.h"
#include "utils/rel.h"
-static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);
@@ -48,14 +49,16 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
* Convert to absolute page number by adding the number of bucket pages
* that exist before this split point.
*/
- return (BlockNumber) ((1 << i) + ovflbitnum);
+ return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
/*
+ * _hash_ovflblkno_to_bitno
+ *
* Convert overflow page block number to bit number for free-page bitmap.
*/
-static uint32
-blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
+uint32
+_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
uint32 splitnum = metap->hashm_ovflpoint;
uint32 i;
@@ -64,14 +67,24 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
/* Determine the split number containing this page */
for (i = 1; i <= splitnum; i++)
{
- if (ovflblkno <= (BlockNumber) (1 << i))
+ if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
break; /* oops */
- bitnum = ovflblkno - (1 << i);
- if (bitnum <= metap->hashm_spares[i])
+ bitnum = ovflblkno - _hash_get_totalbuckets(i);
+
+ /*
+ * bitnum has to be greater than the number of overflow pages added in
+ * the previous split point. The overflow pages at this splitnum (i), if
+ * any, start from block (_hash_get_totalbuckets(i) +
+ * metap->hashm_spares[i - 1] + 1).
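+ *
+ * A hypothetical example: if _hash_get_totalbuckets(i) = 10,
+ * metap->hashm_spares[i - 1] = 2 and metap->hashm_spares[i] = 5, then
+ * overflow blocks 13 through 15 belong to this split point and map to
+ * bit numbers 2 through 4.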
+ */
+ if (bitnum > metap->hashm_spares[i - 1] &&
+ bitnum <= metap->hashm_spares[i])
return bitnum - 1; /* -1 to convert 1-based to 0-based */
}
- elog(ERROR, "invalid overflow block number %u", ovflblkno);
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid overflow block number %u", ovflblkno)));
return 0; /* keep compiler quiet */
}
@@ -82,38 +95,59 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
*
* On entry, the caller must hold a pin but no lock on 'buf'. The pin is
* dropped before exiting (we assume the caller is not interested in 'buf'
- * anymore). The returned overflow page will be pinned and write-locked;
- * it is guaranteed to be empty.
+ * anymore) if not asked to retain. The pin will be retained only for the
+ * primary bucket. The returned overflow page will be pinned and
+ * write-locked; it is guaranteed to be empty.
*
* The caller must hold a pin, but no lock, on the metapage buffer.
* That buffer is returned in the same state.
*
- * The caller must hold at least share lock on the bucket, to ensure that
- * no one else tries to compact the bucket meanwhile. This guarantees that
- * 'buf' won't stop being part of the bucket while it's unlocked.
- *
* NB: since this could be executed concurrently by multiple processes,
* one should not assume that the returned overflow page will be the
* immediate successor of the originally passed 'buf'. Additional overflow
* pages might have been added to the bucket chain in between.
*/
Buffer
-_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
+_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
Buffer ovflbuf;
Page page;
Page ovflpage;
HashPageOpaque pageopaque;
HashPageOpaque ovflopaque;
-
- /* allocate and lock an empty overflow page */
- ovflbuf = _hash_getovflpage(rel, metabuf);
+ HashMetaPage metap;
+ Buffer mapbuf = InvalidBuffer;
+ Buffer newmapbuf = InvalidBuffer;
+ BlockNumber blkno;
+ uint32 orig_firstfree;
+ uint32 splitnum;
+ uint32 *freep = NULL;
+ uint32 max_ovflpg;
+ uint32 bit;
+ uint32 bitmap_page_bit;
+ uint32 first_page;
+ uint32 last_bit;
+ uint32 last_page;
+ uint32 i,
+ j;
+ bool page_found = false;
/*
- * Write-lock the tail page. It is okay to hold two buffer locks here
- * since there cannot be anyone else contending for access to ovflbuf.
+ * Write-lock the tail page. Here we need to maintain the following locking
+ * order: first acquire the lock on the tail page of the bucket, then on the
+ * meta page to find and lock the bitmap page; once the bitmap page is
+ * found, release the lock on the meta page and finally acquire the lock on
+ * the new overflow buffer. We need this locking order to avoid deadlock
+ * with backends that are doing inserts.
+ *
+ * Note: We could have avoided locking many buffers here if we made two
+ * WAL records for acquiring an overflow page (one to allocate an overflow
+ * page and another to add it to overflow bucket chain). However, doing
+ * so can leak an overflow page, if the system crashes after allocation.
+ * Needless to say, it is better to have a single record from a
+ * performance point of view as well.
*/
- _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
/* probably redundant... */
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
@@ -131,59 +165,22 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
break;
/* we assume we do not need to write the unmodified page */
- _hash_relbuf(rel, buf);
+ if (retain_pin)
+ {
+ /* pin will be retained only for the primary bucket page */
+ Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ else
+ _hash_relbuf(rel, buf);
+
+ retain_pin = false;
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
}
- /* now that we have correct backlink, initialize new overflow page */
- ovflpage = BufferGetPage(ovflbuf);
- ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
- ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
- ovflopaque->hasho_nextblkno = InvalidBlockNumber;
- ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
- ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
- ovflopaque->hasho_page_id = HASHO_PAGE_ID;
-
- MarkBufferDirty(ovflbuf);
-
- /* logically chain overflow page to previous page */
- pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
- _hash_wrtbuf(rel, buf);
-
- return ovflbuf;
-}
-
-/*
- * _hash_getovflpage()
- *
- * Find an available overflow page and return it. The returned buffer
- * is pinned and write-locked, and has had _hash_pageinit() applied,
- * but it is caller's responsibility to fill the special space.
- *
- * The caller must hold a pin, but no lock, on the metapage buffer.
- * That buffer is left in the same state at exit.
- */
-static Buffer
-_hash_getovflpage(Relation rel, Buffer metabuf)
-{
- HashMetaPage metap;
- Buffer mapbuf = 0;
- Buffer newbuf;
- BlockNumber blkno;
- uint32 orig_firstfree;
- uint32 splitnum;
- uint32 *freep = NULL;
- uint32 max_ovflpg;
- uint32 bit;
- uint32 first_page;
- uint32 last_bit;
- uint32 last_page;
- uint32 i,
- j;
-
/* Get exclusive lock on the meta page */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
_hash_checkpage(rel, metabuf, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -221,7 +218,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
last_inpage = BMPGSZ_BIT(metap) - 1;
/* Release exclusive lock on metapage while reading bitmap page */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
mappage = BufferGetPage(mapbuf);
@@ -230,17 +227,37 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
{
if (freep[j] != ALL_SET)
+ {
+ page_found = true;
+
+ /* Reacquire exclusive lock on the meta page */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+ /* convert bit to bit number within page */
+ bit += _hash_firstfreebit(freep[j]);
+ bitmap_page_bit = bit;
+
+ /* convert bit to absolute bit number */
+ bit += (i << BMPG_SHIFT(metap));
+ /* Calculate address of the recycled overflow page */
+ blkno = bitno_to_blkno(metap, bit);
+
+ /* Fetch and init the recycled page */
+ ovflbuf = _hash_getinitbuf(rel, blkno);
+
goto found;
+ }
}
/* No free space here, try to advance to next map page */
_hash_relbuf(rel, mapbuf);
+ mapbuf = InvalidBuffer;
i++;
j = 0; /* scan from start of next map page */
bit = 0;
/* Reacquire exclusive lock on the meta page */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
}
/*
@@ -258,8 +275,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* convenient to pre-mark them as "in use" too.
*/
bit = metap->hashm_spares[splitnum];
- _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
- metap->hashm_spares[splitnum]++;
+
+ /* metapage already has a write lock */
+ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of overflow pages in hash index \"%s\"",
+ RelationGetRelationName(rel))));
+
+ newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
}
else
{
@@ -270,7 +294,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
}
/* Calculate address of the new overflow page */
- bit = metap->hashm_spares[splitnum];
+ bit = BufferIsValid(newmapbuf) ?
+ metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
blkno = bitno_to_blkno(metap, bit);
/*
@@ -278,39 +303,52 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* relation length stays in sync with ours. XXX It's annoying to do this
* with metapage write lock held; would be better to use a lock that
* doesn't block incoming searches.
+ *
+ * It is okay to hold two buffer locks here (one on the tail page of the
+ * bucket and the other on the new overflow page) since there cannot be
+ * anyone else
+ * contending for access to ovflbuf.
*/
- newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
+ ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
- metap->hashm_spares[splitnum]++;
+found:
/*
- * Adjust hashm_firstfree to avoid redundant searches. But don't risk
- * changing it if someone moved it while we were searching bitmap pages.
+ * Do the update. No ereport(ERROR) until changes are logged. We want to
+ * log the changes for the bitmap page and the overflow page together, so
+ * that a newly added overflow page cannot be leaked if the system crashes
+ * in between.
*/
- if (metap->hashm_firstfree == orig_firstfree)
- metap->hashm_firstfree = bit + 1;
-
- /* Write updated metapage and release lock, but not pin */
- _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+ START_CRIT_SECTION();
- return newbuf;
-
-found:
- /* convert bit to bit number within page */
- bit += _hash_firstfreebit(freep[j]);
-
- /* mark page "in use" in the bitmap */
- SETBIT(freep, bit);
- _hash_wrtbuf(rel, mapbuf);
+ if (page_found)
+ {
+ Assert(BufferIsValid(mapbuf));
- /* Reacquire exclusive lock on the meta page */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ /* mark page "in use" in the bitmap */
+ SETBIT(freep, bitmap_page_bit);
+ MarkBufferDirty(mapbuf);
+ }
+ else
+ {
+ /* update the count to indicate new overflow page is added */
+ metap->hashm_spares[splitnum]++;
- /* convert bit to absolute bit number */
- bit += (i << BMPG_SHIFT(metap));
+ if (BufferIsValid(newmapbuf))
+ {
+ _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
+ MarkBufferDirty(newmapbuf);
+
+ /* add the new bitmap page to the metapage's list of bitmaps */
+ metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
+ metap->hashm_nmaps++;
+ metap->hashm_spares[splitnum]++;
+ MarkBufferDirty(metabuf);
+ }
- /* Calculate address of the recycled overflow page */
- blkno = bitno_to_blkno(metap, bit);
+ /*
+ * For a new overflow page, we don't need to explicitly set the bit in
+ * the bitmap page, as every bit in a bitmap page is initially set to
+ * "in use" (see _hash_initbitmapbuffer()).
+ */
+ }
/*
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
@@ -319,18 +357,84 @@ found:
if (metap->hashm_firstfree == orig_firstfree)
{
metap->hashm_firstfree = bit + 1;
-
- /* Write updated metapage and release lock, but not pin */
- _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+ MarkBufferDirty(metabuf);
}
- else
+
+ /* initialize new overflow page */
+ ovflpage = BufferGetPage(ovflbuf);
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+ ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+ ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+ MarkBufferDirty(ovflbuf);
+
+ /* logically chain overflow page to previous page */
+ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
{
- /* We didn't change the metapage, so no need to write */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ XLogRecPtr recptr;
+ xl_hash_add_ovfl_page xlrec;
+
+ xlrec.bmpage_found = page_found;
+ xlrec.bmsize = metap->hashm_bmsize;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);
+
+ XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
+ XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));
+
+ XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
+
+ if (BufferIsValid(mapbuf))
+ {
+ XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
+ }
+
+ if (BufferIsValid(newmapbuf))
+ XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
+
+ XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
+ XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
+
+ PageSetLSN(BufferGetPage(ovflbuf), recptr);
+ PageSetLSN(BufferGetPage(buf), recptr);
+
+ if (BufferIsValid(mapbuf))
+ PageSetLSN(BufferGetPage(mapbuf), recptr);
+
+ if (BufferIsValid(newmapbuf))
+ PageSetLSN(BufferGetPage(newmapbuf), recptr);
+
+ PageSetLSN(BufferGetPage(metabuf), recptr);
}
- /* Fetch, init, and return the recycled page */
- return _hash_getinitbuf(rel, blkno);
+ END_CRIT_SECTION();
+
+ if (retain_pin)
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, buf);
+
+ if (BufferIsValid(mapbuf))
+ _hash_relbuf(rel, mapbuf);
+
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+ if (BufferIsValid(newmapbuf))
+ _hash_relbuf(rel, newmapbuf);
+
+ return ovflbuf;
}
/*
@@ -363,18 +467,27 @@ _hash_firstfreebit(uint32 map)
* Remove this overflow page from its bucket's chain, and mark the page as
* free. On entry, ovflbuf is write-locked; it is released before exiting.
*
+ * Add the tuples (itups) to wbuf in this function. We could do that in the
+ * caller as well, but the advantage of doing it here is that we can easily
+ * write the WAL for the XLOG_HASH_SQUEEZE_PAGE operation. The addition of
+ * tuples and the removal of the overflow page have to be done atomically;
+ * otherwise, during replay on a standby, users might find duplicate records.
+ *
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
*
* Returns the block number of the page that followed the given page
* in the bucket, or InvalidBlockNumber if no following page.
*
- * NB: caller must not hold lock on metapage, nor on either page that's
- * adjacent in the bucket chain. The caller had better hold exclusive lock
- * on the bucket, too.
+ * NB: caller must not hold a lock on the metapage, nor on the page that is
+ * next to ovflbuf in the bucket chain. We don't acquire the lock on the
+ * page that precedes ovflbuf in the chain if it is the same as wbuf,
+ * because the caller already holds a lock on it.
*/
BlockNumber
-_hash_freeovflpage(Relation rel, Buffer ovflbuf,
+_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
+ Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
+ Size *tups_size, uint16 nitups,
BufferAccessStrategy bstrategy)
{
HashMetaPage metap;
@@ -384,6 +497,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
BlockNumber prevblkno;
BlockNumber blkno;
BlockNumber nextblkno;
+ BlockNumber writeblkno;
HashPageOpaque ovflopaque;
Page ovflpage;
Page mappage;
@@ -392,6 +506,9 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
int32 bitmappage,
bitmapbit;
Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
+ Buffer prevbuf = InvalidBuffer;
+ Buffer nextbuf = InvalidBuffer;
+ bool update_metap = false;
/* Get information from the doomed page */
_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
@@ -400,50 +517,32 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
nextblkno = ovflopaque->hasho_nextblkno;
prevblkno = ovflopaque->hasho_prevblkno;
+ writeblkno = BufferGetBlockNumber(wbuf);
bucket = ovflopaque->hasho_bucket;
/*
- * Zero the page for debugging's sake; then write and release it. (Note:
- * if we failed to zero the page here, we'd have problems with the Assert
- * in _hash_pageinit() when the page is reused.)
- */
- MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
- _hash_wrtbuf(rel, ovflbuf);
-
- /*
* Fix up the bucket chain. this is a doubly-linked list, so we must fix
* up the bucket chain members behind and ahead of the overflow page being
- * deleted. No concurrency issues since we hold exclusive lock on the
- * entire bucket.
+ * deleted. Concurrency issues are avoided by using lock chaining as
+ * described atop hashbucketcleanup.
*/
if (BlockNumberIsValid(prevblkno))
{
- Buffer prevbuf = _hash_getbuf_with_strategy(rel,
- prevblkno,
- HASH_WRITE,
+ if (prevblkno == writeblkno)
+ prevbuf = wbuf;
+ else
+ prevbuf = _hash_getbuf_with_strategy(rel,
+ prevblkno,
+ HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
- bstrategy);
- Page prevpage = BufferGetPage(prevbuf);
- HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
-
- Assert(prevopaque->hasho_bucket == bucket);
- prevopaque->hasho_nextblkno = nextblkno;
- _hash_wrtbuf(rel, prevbuf);
+ bstrategy);
}
if (BlockNumberIsValid(nextblkno))
- {
- Buffer nextbuf = _hash_getbuf_with_strategy(rel,
- nextblkno,
- HASH_WRITE,
- LH_OVERFLOW_PAGE,
- bstrategy);
- Page nextpage = BufferGetPage(nextbuf);
- HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
-
- Assert(nextopaque->hasho_bucket == bucket);
- nextopaque->hasho_prevblkno = prevblkno;
- _hash_wrtbuf(rel, nextbuf);
- }
+ nextbuf = _hash_getbuf_with_strategy(rel,
+ nextblkno,
+ HASH_WRITE,
+ LH_OVERFLOW_PAGE,
+ bstrategy);
/* Note: bstrategy is intentionally not used for metapage and bitmap */
@@ -452,7 +551,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
metap = HashPageGetMeta(BufferGetPage(metabuf));
/* Identify which bit to set */
- ovflbitno = blkno_to_bitno(metap, ovflblkno);
+ ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
bitmappage = ovflbitno >> BMPG_SHIFT(metap);
bitmapbit = ovflbitno & BMPG_MASK(metap);
@@ -462,67 +561,193 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
blkno = metap->hashm_mapp[bitmappage];
/* Release metapage lock while we access the bitmap page */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
- /* Clear the bitmap bit to indicate that this overflow page is free */
+ /* read the bitmap page to clear the bitmap bit */
mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
mappage = BufferGetPage(mapbuf);
freep = HashPageGetBitmap(mappage);
Assert(ISSET(freep, bitmapbit));
- CLRBIT(freep, bitmapbit);
- _hash_wrtbuf(rel, mapbuf);
/* Get write-lock on metapage to update firstfree */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+ /* This operation needs to log multiple tuples, prepare WAL for that */
+ if (RelationNeedsWAL(rel))
+ XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);
+
+ START_CRIT_SECTION();
+
+ /*
+ * we have to insert tuples on the "write" page, being careful to preserve
+ * hashkey ordering. (If we insert many tuples into the same "write" page
+ * it would be worth qsort'ing them).
+ */
+ if (nitups > 0)
+ {
+ _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
+ MarkBufferDirty(wbuf);
+ }
+
+ /*
+ * Reinitialize the freed overflow page. Just zeroing the page won't
+ * work, because WAL replay routines expect pages to be initialized. See
+ * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
+ * careful to make the special space valid here so that tools like
+ * pageinspect won't get confused.
+ */
+ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+
+ ovflopaque->hasho_prevblkno = InvalidBlockNumber;
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_bucket = -1;
+ ovflopaque->hasho_flag = LH_UNUSED_PAGE;
+ ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+ MarkBufferDirty(ovflbuf);
+
+ if (BufferIsValid(prevbuf))
+ {
+ Page prevpage = BufferGetPage(prevbuf);
+ HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+ Assert(prevopaque->hasho_bucket == bucket);
+ prevopaque->hasho_nextblkno = nextblkno;
+ MarkBufferDirty(prevbuf);
+ }
+ if (BufferIsValid(nextbuf))
+ {
+ Page nextpage = BufferGetPage(nextbuf);
+ HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
+
+ Assert(nextopaque->hasho_bucket == bucket);
+ nextopaque->hasho_prevblkno = prevblkno;
+ MarkBufferDirty(nextbuf);
+ }
+
+ /* Clear the bitmap bit to indicate that this overflow page is free */
+ CLRBIT(freep, bitmapbit);
+ MarkBufferDirty(mapbuf);
/* if this is now the first free page, update hashm_firstfree */
if (ovflbitno < metap->hashm_firstfree)
{
metap->hashm_firstfree = ovflbitno;
- _hash_wrtbuf(rel, metabuf);
+ update_metap = true;
+ MarkBufferDirty(metabuf);
}
- else
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
{
- /* no need to change metapage */
- _hash_relbuf(rel, metabuf);
+ xl_hash_squeeze_page xlrec;
+ XLogRecPtr recptr;
+ int i;
+
+ xlrec.prevblkno = prevblkno;
+ xlrec.nextblkno = nextblkno;
+ xlrec.ntups = nitups;
+ xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
+ xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);
+
+ /*
+ * bucket buffer needs to be registered to ensure that we can acquire
+ * a cleanup lock on it during replay.
+ */
+ if (!xlrec.is_prim_bucket_same_wrt)
+ XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
+
+ XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
+ if (xlrec.ntups > 0)
+ {
+ XLogRegisterBufData(1, (char *) itup_offsets,
+ nitups * sizeof(OffsetNumber));
+ for (i = 0; i < nitups; i++)
+ XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
+ }
+
+ XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
+
+ /*
+ * If prevpage and the writepage (block in which we are moving tuples
+ * from overflow) are same, then no need to separately register
+ * prevpage. During replay, we can directly update the nextblock in
+ * writepage.
+ */
+ if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
+ XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
+
+ if (BufferIsValid(nextbuf))
+ XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
+
+ XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
+ XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));
+
+ if (update_metap)
+ {
+ XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
+ XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
+ }
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
+
+ PageSetLSN(BufferGetPage(wbuf), recptr);
+ PageSetLSN(BufferGetPage(ovflbuf), recptr);
+
+ if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
+ PageSetLSN(BufferGetPage(prevbuf), recptr);
+ if (BufferIsValid(nextbuf))
+ PageSetLSN(BufferGetPage(nextbuf), recptr);
+
+ PageSetLSN(BufferGetPage(mapbuf), recptr);
+
+ if (update_metap)
+ PageSetLSN(BufferGetPage(metabuf), recptr);
}
+ END_CRIT_SECTION();
+
+ /* release previous bucket if it is not same as write bucket */
+ if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
+ _hash_relbuf(rel, prevbuf);
+
+ if (BufferIsValid(ovflbuf))
+ _hash_relbuf(rel, ovflbuf);
+
+ if (BufferIsValid(nextbuf))
+ _hash_relbuf(rel, nextbuf);
+
+ _hash_relbuf(rel, mapbuf);
+ _hash_relbuf(rel, metabuf);
+
return nextblkno;
}
/*
- * _hash_initbitmap()
- *
- * Initialize a new bitmap page. The metapage has a write-lock upon
- * entering the function, and must be written by caller after return.
+ * _hash_initbitmapbuffer()
*
- * 'blkno' is the block number of the new bitmap page.
- *
- * All bits in the new bitmap page are set to "1", indicating "in use".
+ * Initialize a new bitmap page. All bits in the new bitmap page are set to
+ * "1", indicating "in use".
*/
void
-_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
- ForkNumber forkNum)
+_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
- Buffer buf;
Page pg;
HashPageOpaque op;
uint32 *freep;
- /*
- * It is okay to write-lock the new bitmap page while holding metapage
- * write lock, because no one else could be contending for the new page.
- * Also, the metapage lock makes it safe to extend the index using
- * _hash_getnewbuf.
- *
- * There is some loss of concurrency in possibly doing I/O for the new
- * page while holding the metapage lock, but this path is taken so seldom
- * that it's not worth worrying about.
- */
- buf = _hash_getnewbuf(rel, blkno, forkNum);
pg = BufferGetPage(buf);
+ /* initialize the page */
+ if (initpage)
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+
/* initialize the page's special space */
op = (HashPageOpaque) PageGetSpecialPointer(pg);
op->hasho_prevblkno = InvalidBlockNumber;
@@ -533,22 +758,14 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
/* set all of the bits to 1 */
freep = HashPageGetBitmap(pg);
- MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));
-
- /* write out the new bitmap page (releasing write lock and pin) */
- _hash_wrtbuf(rel, buf);
+ MemSet(freep, 0xFF, bmsize);
- /* add the new bitmap page to the metapage's list of bitmaps */
- /* metapage already has a write lock */
- if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("out of overflow pages in hash index \"%s\"",
- RelationGetRelationName(rel))));
-
- metap->hashm_mapp[metap->hashm_nmaps] = blkno;
-
- metap->hashm_nmaps++;
+ /*
+ * Set pd_lower just past the end of the bitmap page data. We could even
+ * set pd_lower equal to pd_upper, but this is more precise and makes the
+ * page look compressible to xlog.c.
+ */
+ ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
}
@@ -570,8 +787,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
* required that to be true on entry as well, but it's a lot easier for
* callers to leave empty overflow pages and let this guy clean it up.
*
- * Caller must hold exclusive lock on the target bucket. This allows
- * us to safely lock multiple pages in the bucket.
+ * Caller must acquire a cleanup lock on the primary page of the target
+ * bucket to exclude any scans that are in progress, which could easily
+ * be confused into returning the same tuple more than once or some tuples
+ * not at all by the rearrangement we are performing here. To prevent
+ * any concurrent scan from crossing the squeeze scan we use lock chaining
+ * similar to hashbucketcleanup; refer to the comments atop
+ * hashbucketcleanup.
+ *
+ * We need to retain a pin on the primary bucket to ensure that no concurrent
+ * split can start.
*
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
@@ -580,6 +804,7 @@ void
_hash_squeezebucket(Relation rel,
Bucket bucket,
BlockNumber bucket_blkno,
+ Buffer bucket_buf,
BufferAccessStrategy bstrategy)
{
BlockNumber wblkno;
@@ -590,26 +815,22 @@ _hash_squeezebucket(Relation rel,
Page rpage;
HashPageOpaque wopaque;
HashPageOpaque ropaque;
- bool wbuf_dirty;
/*
- * start squeezing into the base bucket page.
+ * start squeezing into the primary bucket page.
*/
wblkno = bucket_blkno;
- wbuf = _hash_getbuf_with_strategy(rel,
- wblkno,
- HASH_WRITE,
- LH_BUCKET_PAGE,
- bstrategy);
+ wbuf = bucket_buf;
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
/*
- * if there aren't any overflow pages, there's nothing to squeeze.
+ * if there aren't any overflow pages, there's nothing to squeeze. caller
+ * is responsible for releasing the pin on primary bucket page.
*/
if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
{
- _hash_relbuf(rel, wbuf);
+ LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
return;
}
@@ -639,14 +860,21 @@ _hash_squeezebucket(Relation rel,
/*
* squeeze the tuples.
*/
- wbuf_dirty = false;
for (;;)
{
OffsetNumber roffnum;
OffsetNumber maxroffnum;
OffsetNumber deletable[MaxOffsetNumber];
- int ndeletable = 0;
-
+ IndexTuple itups[MaxIndexTuplesPerPage];
+ Size tups_size[MaxIndexTuplesPerPage];
+ OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
+ uint16 ndeletable = 0;
+ uint16 nitups = 0;
+ Size all_tups_size = 0;
+ int i;
+ bool retain_pin = false;
+
+readpage:
/* Scan each tuple in "read" page */
maxroffnum = PageGetMaxOffsetNumber(rpage);
for (roffnum = FirstOffsetNumber;
@@ -656,6 +884,10 @@ _hash_squeezebucket(Relation rel,
IndexTuple itup;
Size itemsz;
+ /* skip dead tuples */
+ if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
+ continue;
+
itup = (IndexTuple) PageGetItem(rpage,
PageGetItemId(rpage, roffnum));
itemsz = IndexTupleDSize(*itup);
@@ -663,56 +895,144 @@ _hash_squeezebucket(Relation rel,
/*
* Walk up the bucket chain, looking for a page big enough for
- * this item. Exit if we reach the read page.
+ * this item and all other accumulated items. Exit if we reach
+ * the read page.
*/
- while (PageGetFreeSpace(wpage) < itemsz)
+ while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
{
+ Buffer next_wbuf = InvalidBuffer;
+ bool tups_moved = false;
+
Assert(!PageIsEmpty(wpage));
+ if (wblkno == bucket_blkno)
+ retain_pin = true;
+
wblkno = wopaque->hasho_nextblkno;
Assert(BlockNumberIsValid(wblkno));
- if (wbuf_dirty)
- _hash_wrtbuf(rel, wbuf);
+ /* don't need to move to next page if we reached the read page */
+ if (wblkno != rblkno)
+ next_wbuf = _hash_getbuf_with_strategy(rel,
+ wblkno,
+ HASH_WRITE,
+ LH_OVERFLOW_PAGE,
+ bstrategy);
+
+ if (nitups > 0)
+ {
+ Assert(nitups == ndeletable);
+
+ /*
+ * This operation needs to log multiple tuples, prepare
+ * WAL for that.
+ */
+ if (RelationNeedsWAL(rel))
+ XLogEnsureRecordSpace(0, 3 + nitups);
+
+ START_CRIT_SECTION();
+
+ /*
+ * we have to insert tuples on the "write" page, being
+ * careful to preserve hashkey ordering. (If we insert
+ * many tuples into the same "write" page it would be
+ * worth qsort'ing them).
+ */
+ _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
+ MarkBufferDirty(wbuf);
+
+ /* Delete tuples we already moved off read page */
+ PageIndexMultiDelete(rpage, deletable, ndeletable);
+ MarkBufferDirty(rbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_hash_move_page_contents xlrec;
+
+ xlrec.ntups = nitups;
+ xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);
+
+ /*
+ * bucket buffer needs to be registered to ensure that
+ * we can acquire a cleanup lock on it during replay.
+ */
+ if (!xlrec.is_prim_bucket_same_wrt)
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
+
+ XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
+ XLogRegisterBufData(1, (char *) itup_offsets,
+ nitups * sizeof(OffsetNumber));
+ for (i = 0; i < nitups; i++)
+ XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
+
+ XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
+
+ PageSetLSN(BufferGetPage(wbuf), recptr);
+ PageSetLSN(BufferGetPage(rbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ tups_moved = true;
+ }
+
+ /*
+ * release the lock on previous page after acquiring the lock
+ * on next page
+ */
+ if (retain_pin)
+ LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, wbuf);
/* nothing more to do if we reached the read page */
if (rblkno == wblkno)
{
- if (ndeletable > 0)
- {
- /* Delete tuples we already moved off read page */
- PageIndexMultiDelete(rpage, deletable, ndeletable);
- _hash_wrtbuf(rel, rbuf);
- }
- else
- _hash_relbuf(rel, rbuf);
+ _hash_relbuf(rel, rbuf);
return;
}
- wbuf = _hash_getbuf_with_strategy(rel,
- wblkno,
- HASH_WRITE,
- LH_OVERFLOW_PAGE,
- bstrategy);
+ wbuf = next_wbuf;
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
Assert(wopaque->hasho_bucket == bucket);
- wbuf_dirty = false;
+ retain_pin = false;
+
+ /* be tidy */
+ for (i = 0; i < nitups; i++)
+ pfree(itups[i]);
+ nitups = 0;
+ all_tups_size = 0;
+ ndeletable = 0;
+
+ /*
+ * after moving the tuples, rpage would have been compacted,
+ * so we need to rescan it.
+ */
+ if (tups_moved)
+ goto readpage;
}
- /*
- * we have found room so insert on the "write" page, being careful
- * to preserve hashkey ordering. (If we insert many tuples into
- * the same "write" page it would be worth qsort'ing instead of
- * doing repeated _hash_pgaddtup.)
- */
- (void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
- wbuf_dirty = true;
-
/* remember tuple for deletion from "read" page */
deletable[ndeletable++] = roffnum;
+
+ /*
+ * We need a copy of the index tuples, as the originals can be freed as
+ * part of the overflow page; however, we need them to write the WAL
+ * record in _hash_freeovflpage.
+ */
+ itups[nitups] = CopyIndexTuple(itup);
+ tups_size[nitups++] = itemsz;
+ all_tups_size += itemsz;
}
/*
@@ -724,29 +1044,30 @@ _hash_squeezebucket(Relation rel,
* Tricky point here: if our read and write pages are adjacent in the
* bucket chain, our write lock on wbuf will conflict with
* _hash_freeovflpage's attempt to update the sibling links of the
- * removed page. However, in that case we are done anyway, so we can
- * simply drop the write lock before calling _hash_freeovflpage.
+ * removed page. In that case, we don't need to lock it again.
*/
rblkno = ropaque->hasho_prevblkno;
Assert(BlockNumberIsValid(rblkno));
+ /* free this overflow page (releases rbuf) */
+ _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
+ tups_size, nitups, bstrategy);
+
+ /* be tidy */
+ for (i = 0; i < nitups; i++)
+ pfree(itups[i]);
+
/* are we freeing the page adjacent to wbuf? */
if (rblkno == wblkno)
{
- /* yes, so release wbuf lock first */
- if (wbuf_dirty)
- _hash_wrtbuf(rel, wbuf);
+ /* retain the pin on primary bucket page till end of bucket scan */
+ if (wblkno == bucket_blkno)
+ LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, wbuf);
- /* free this overflow page (releases rbuf) */
- _hash_freeovflpage(rel, rbuf, bstrategy);
- /* done */
return;
}
- /* free this overflow page, then get the previous one */
- _hash_freeovflpage(rel, rbuf, bstrategy);
-
rbuf = _hash_getbuf_with_strategy(rel,
rblkno,
HASH_WRITE,
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 178463fcb6..4544889294 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -3,7 +3,7 @@
* hashpage.c
* Hash table page management code for the Postgres hash access method
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -29,6 +29,7 @@
#include "postgres.h"
#include "access/hash.h"
+#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
@@ -38,10 +39,12 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket,
- BlockNumber start_oblkno,
+ Buffer obuf,
Buffer nbuf,
+ HTAB *htab,
uint32 maxbucket,
uint32 highmask, uint32 lowmask);
+static void log_split_page(Relation rel, Buffer buf);
/*
@@ -55,46 +58,6 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf,
/*
- * _hash_getlock() -- Acquire an lmgr lock.
- *
- * 'whichlock' should the block number of a bucket's primary bucket page to
- * acquire the per-bucket lock. (See README for details of the use of these
- * locks.)
- *
- * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
- */
-void
-_hash_getlock(Relation rel, BlockNumber whichlock, int access)
-{
- if (USELOCKING(rel))
- LockPage(rel, whichlock, access);
-}
-
-/*
- * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
- *
- * Same as above except we return FALSE without blocking if lock isn't free.
- */
-bool
-_hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
-{
- if (USELOCKING(rel))
- return ConditionalLockPage(rel, whichlock, access);
- else
- return true;
-}
-
-/*
- * _hash_droplock() -- Release an lmgr lock.
- */
-void
-_hash_droplock(Relation rel, BlockNumber whichlock, int access)
-{
- if (USELOCKING(rel))
- UnlockPage(rel, whichlock, access);
-}
-
-/*
* _hash_getbuf() -- Get a buffer by block number for read or write.
*
* 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
@@ -132,6 +95,35 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
}
/*
+ * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
+ *
+ * We read the page and try to acquire a cleanup lock. If we get it,
+ * we return the buffer; otherwise, we return InvalidBuffer.
+ */
+Buffer
+_hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
+{
+ Buffer buf;
+
+ if (blkno == P_NEW)
+ elog(ERROR, "hash AM does not use P_NEW");
+
+ buf = ReadBuffer(rel, blkno);
+
+ if (!ConditionalLockBufferForCleanup(buf))
+ {
+ ReleaseBuffer(buf);
+ return InvalidBuffer;
+ }
+
+ /* ref count and lock type are correct */
+
+ _hash_checkpage(rel, buf, flags);
+
+ return buf;
+}
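
For illustration only: the function above is meant for callers that treat InvalidBuffer as "try again later" rather than as an error, which is how _hash_expandtable() further down in this patch uses it (via goto fail). The sketch below shows the shape of such a caller; it assumes the backend headers (postgres.h, access/hash.h), and the wrapper name try_bucket_cleanup_lock is hypothetical, not part of the patch.

#include "postgres.h"
#include "access/hash.h"

static bool
try_bucket_cleanup_lock(Relation rel, BlockNumber bucket_blkno)
{
	Buffer		buf;

	/* returns InvalidBuffer instead of blocking if the cleanup lock is busy */
	buf = _hash_getbuf_with_condlock_cleanup(rel, bucket_blkno, LH_BUCKET_PAGE);
	if (!BufferIsValid(buf))
		return false;			/* caller gives up quietly, e.g. "goto fail" */

	/* ... work on the bucket while holding the cleanup lock ... */

	_hash_relbuf(rel, buf);		/* release the lock and the pin */
	return true;
}
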
+
+/*
* _hash_getinitbuf() -- Get and initialize a buffer by block number.
*
* This must be used only to fetch pages that are known to be before
@@ -167,6 +159,36 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
}
/*
+ * _hash_initbuf() -- Get and initialize a buffer by bucket number.
+ */
+void
+_hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
+ bool initpage)
+{
+ HashPageOpaque pageopaque;
+ Page page;
+
+ page = BufferGetPage(buf);
+
+ /* initialize the page */
+ if (initpage)
+ _hash_pageinit(page, BufferGetPageSize(buf));
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Set hasho_prevblkno with current hashm_maxbucket. This value will be
+ * used to validate cached HashMetaPageData. See
+ * _hash_getbucketbuf_from_hashkey().
+ */
+ pageopaque->hasho_prevblkno = max_bucket;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_bucket = num_bucket;
+ pageopaque->hasho_flag = flag;
+ pageopaque->hasho_page_id = HASHO_PAGE_ID;
+}
+
+/*
* _hash_getnewbuf() -- Get a new page at the end of the index.
*
* This has the same API as _hash_getinitbuf, except that we are adding
@@ -266,53 +288,39 @@ _hash_dropbuf(Relation rel, Buffer buf)
}
/*
- * _hash_wrtbuf() -- write a hash page to disk.
+ * _hash_dropscanbuf() -- release buffers used in scan.
*
- * This routine releases the lock held on the buffer and our refcount
- * for it. It is an error to call _hash_wrtbuf() without a write lock
- * and a pin on the buffer.
- *
- * NOTE: this routine should go away when/if hash indexes are WAL-ified.
- * The correct sequence of operations is to mark the buffer dirty, then
- * write the WAL record, then release the lock and pin; so marking dirty
- * can't be combined with releasing.
+ * This routine unpins the buffers used during the scan on which we
+ * hold no lock.
*/
void
-_hash_wrtbuf(Relation rel, Buffer buf)
+_hash_dropscanbuf(Relation rel, HashScanOpaque so)
{
- MarkBufferDirty(buf);
- UnlockReleaseBuffer(buf);
-}
-
-/*
- * _hash_chgbufaccess() -- Change the lock type on a buffer, without
- * dropping our pin on it.
- *
- * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
- * the last indicating that no buffer-level lock is held or wanted.
- *
- * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
- * bufmgr it must be written out. If the caller wants to release a write
- * lock on a page that's not been modified, it's okay to pass from_access
- * as HASH_READ (a bit ugly, but handy in some places).
- */
-void
-_hash_chgbufaccess(Relation rel,
- Buffer buf,
- int from_access,
- int to_access)
-{
- if (from_access == HASH_WRITE)
- MarkBufferDirty(buf);
- if (from_access != HASH_NOLOCK)
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- if (to_access != HASH_NOLOCK)
- LockBuffer(buf, to_access);
+ /* release pin we hold on primary bucket page */
+ if (BufferIsValid(so->hashso_bucket_buf) &&
+ so->hashso_bucket_buf != so->hashso_curbuf)
+ _hash_dropbuf(rel, so->hashso_bucket_buf);
+ so->hashso_bucket_buf = InvalidBuffer;
+
+ /* release pin we hold on primary bucket page of bucket being split */
+ if (BufferIsValid(so->hashso_split_bucket_buf) &&
+ so->hashso_split_bucket_buf != so->hashso_curbuf)
+ _hash_dropbuf(rel, so->hashso_split_bucket_buf);
+ so->hashso_split_bucket_buf = InvalidBuffer;
+
+ /* release any pin we still hold */
+ if (BufferIsValid(so->hashso_curbuf))
+ _hash_dropbuf(rel, so->hashso_curbuf);
+ so->hashso_curbuf = InvalidBuffer;
+
+ /* reset split scan */
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
}
/*
- * _hash_metapinit() -- Initialize the metadata page of a hash index,
+ * _hash_init() -- Initialize the metadata page of a hash index,
* the initial buckets, and the initial bitmap page.
*
* The initial number of buckets is dependent on num_tuples, an estimate
@@ -324,19 +332,18 @@ _hash_chgbufaccess(Relation rel,
* multiple buffer locks is ignored.
*/
uint32
-_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
+_hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
{
- HashMetaPage metap;
- HashPageOpaque pageopaque;
Buffer metabuf;
Buffer buf;
+ Buffer bitmapbuf;
Page pg;
+ HashMetaPage metap;
+ RegProcedure procid;
int32 data_width;
int32 item_width;
int32 ffactor;
- double dnumbuckets;
uint32 num_buckets;
- uint32 log2_num_buckets;
uint32 i;
/* safety check */
@@ -358,10 +365,151 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
if (ffactor < 10)
ffactor = 10;
+ procid = index_getprocid(rel, 1, HASHPROC);
+
+ /*
+ * We initialize the metapage, the first N bucket pages, and the first
+ * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
+ * calls to occur. This ensures that the smgr level has the right idea of
+ * the physical index length.
+ *
+ * Critical section not required, because on error the creation of the
+ * whole relation will be rolled back.
+ */
+ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
+ _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
+ MarkBufferDirty(metabuf);
+
+ pg = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(pg);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_init_meta_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.num_tuples = num_tuples;
+ xlrec.procid = metap->hashm_procid;
+ xlrec.ffactor = metap->hashm_ffactor;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
+ XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
+
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ num_buckets = metap->hashm_maxbucket + 1;
+
+ /*
+ * Release buffer lock on the metapage while we initialize buckets.
+ * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
+ * won't accomplish anything. It's a bad idea to hold buffer locks for
+ * long intervals in any case, since that can block the bgwriter.
+ */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Initialize and WAL Log the first N buckets
+ */
+ for (i = 0; i < num_buckets; i++)
+ {
+ BlockNumber blkno;
+
+ /* Allow interrupts, in case N is huge */
+ CHECK_FOR_INTERRUPTS();
+
+ blkno = BUCKET_TO_BLKNO(metap, i);
+ buf = _hash_getnewbuf(rel, blkno, forkNum);
+ _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
+ MarkBufferDirty(buf);
+
+ log_newpage(&rel->rd_node,
+ forkNum,
+ blkno,
+ BufferGetPage(buf),
+ true);
+ _hash_relbuf(rel, buf);
+ }
+
+ /* Now reacquire buffer lock on metapage */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Initialize bitmap page
+ */
+ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
+ _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
+ MarkBufferDirty(bitmapbuf);
+
+ /* add the new bitmap page to the metapage's list of bitmaps */
+ /* metapage already has a write lock */
+ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of overflow pages in hash index \"%s\"",
+ RelationGetRelationName(rel))));
+
+ metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
+
+ metap->hashm_nmaps++;
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_init_bitmap_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.bmsize = metap->hashm_bmsize;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
+ XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
+
+ /*
+ * This is safe only because nobody else can be modifying the index at
+ * this stage; it's only visible to the transaction that is creating
+ * it.
+ */
+ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
+
+ PageSetLSN(BufferGetPage(bitmapbuf), recptr);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ /* all done */
+ _hash_relbuf(rel, bitmapbuf);
+ _hash_relbuf(rel, metabuf);
+
+ return num_buckets;
+}
+
+/*
+ * _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
+ */
+void
+_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
+ uint16 ffactor, bool initpage)
+{
+ HashMetaPage metap;
+ HashPageOpaque pageopaque;
+ Page page;
+ double dnumbuckets;
+ uint32 num_buckets;
+ uint32 spare_index;
+ uint32 i;
+
/*
* Choose the number of initial bucket pages to match the fill factor
* given the estimated number of tuples. We round up the result to the
- * next power of 2, however, and always force at least 2 bucket pages. The
+	 * total number of buckets that have to be allocated before the corresponding
+	 * hashm_spares element is used.  However, always force at least 2 bucket pages. The
* upper limit is determined by considerations explained in
* _hash_expandtable().
*/
@@ -371,36 +519,30 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
else if (dnumbuckets >= (double) 0x40000000)
num_buckets = 0x40000000;
else
- num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+ num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
- log2_num_buckets = _hash_log2(num_buckets);
- Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
- Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+ spare_index = _hash_spareindex(num_buckets);
+ Assert(spare_index < HASH_MAX_SPLITPOINTS);
- /*
- * We initialize the metapage, the first N bucket pages, and the first
- * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
- * calls to occur. This ensures that the smgr level has the right idea of
- * the physical index length.
- */
- metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
- pg = BufferGetPage(metabuf);
+ page = BufferGetPage(buf);
+ if (initpage)
+ _hash_pageinit(page, BufferGetPageSize(buf));
- pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
pageopaque->hasho_prevblkno = InvalidBlockNumber;
pageopaque->hasho_nextblkno = InvalidBlockNumber;
pageopaque->hasho_bucket = -1;
pageopaque->hasho_flag = LH_META_PAGE;
pageopaque->hasho_page_id = HASHO_PAGE_ID;
- metap = HashPageGetMeta(pg);
+ metap = HashPageGetMeta(page);
metap->hashm_magic = HASH_MAGIC;
metap->hashm_version = HASH_VERSION;
metap->hashm_ntuples = 0;
metap->hashm_nmaps = 0;
metap->hashm_ffactor = ffactor;
- metap->hashm_bsize = HashGetMaxBitmapSize(pg);
+ metap->hashm_bsize = HashGetMaxBitmapSize(page);
/* find largest bitmap array size that will fit in page size */
for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
{
@@ -417,63 +559,35 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
* pretty useless for normal operation (in fact, hashm_procid is not used
* anywhere), but it might be handy for forensic purposes so we keep it.
*/
- metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
+ metap->hashm_procid = procid;
/*
* We initialize the index with N buckets, 0 .. N-1, occupying physical
- * blocks 1 to N. The first freespace bitmap page is in block N+1. Since
- * N is a power of 2, we can set the masks this way:
+ * blocks 1 to N. The first freespace bitmap page is in block N+1.
*/
- metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
- metap->hashm_highmask = (num_buckets << 1) - 1;
+ metap->hashm_maxbucket = num_buckets - 1;
+
+ /*
+	 * Set highmask as the next immediate ((2 ^ x) - 1), which should be
+	 * sufficient to cover num_buckets.
+ */
+ metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
+ metap->hashm_lowmask = (metap->hashm_highmask >> 1);
MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
/* Set up mapping for one spare page after the initial splitpoints */
- metap->hashm_spares[log2_num_buckets] = 1;
- metap->hashm_ovflpoint = log2_num_buckets;
+ metap->hashm_spares[spare_index] = 1;
+ metap->hashm_ovflpoint = spare_index;
metap->hashm_firstfree = 0;
/*
- * Release buffer lock on the metapage while we initialize buckets.
- * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
- * won't accomplish anything. It's a bad idea to hold buffer locks for
- * long intervals in any case, since that can block the bgwriter.
- */
- _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
-
- /*
- * Initialize the first N buckets
- */
- for (i = 0; i < num_buckets; i++)
- {
- /* Allow interrupts, in case N is huge */
- CHECK_FOR_INTERRUPTS();
-
- buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
- pg = BufferGetPage(buf);
- pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
- pageopaque->hasho_prevblkno = InvalidBlockNumber;
- pageopaque->hasho_nextblkno = InvalidBlockNumber;
- pageopaque->hasho_bucket = i;
- pageopaque->hasho_flag = LH_BUCKET_PAGE;
- pageopaque->hasho_page_id = HASHO_PAGE_ID;
- _hash_wrtbuf(rel, buf);
- }
-
- /* Now reacquire buffer lock on metapage */
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
-
- /*
- * Initialize first bitmap page
+	 * Set pd_lower just past the end of the metadata.  This is to log the
+	 * full-page image of the metapage in xloginsert.c.
*/
- _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);
-
- /* all done */
- _hash_wrtbuf(rel, metabuf);
-
- return num_buckets;
+ ((PageHeader) page)->pd_lower =
+ ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
}
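
Since the initial number of buckets is no longer forced to a power of two, the code above derives hashm_highmask as the next ((2 ^ x) - 1) that covers num_buckets and hashm_lowmask as half of that; a hash key is then masked down twice, first with highmask and, if the result exceeds maxbucket, again with lowmask. The standalone program below reruns that arithmetic for illustration; toy_log2 and toy_hashkey2bucket are stand-ins for _hash_log2() and _hash_hashkey2bucket() in hashutil.c, and num_buckets = 10 is just a convenient non-power-of-two (the real code only produces totals that land on splitpoint-phase boundaries via _hash_get_totalbuckets()).

#include <stdio.h>
#include <stdint.h>

/* smallest i such that (1 << i) >= num, mirroring _hash_log2() */
static uint32_t
toy_log2(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

/* mask with highmask; if that overshoots maxbucket, fall back to lowmask */
static uint32_t
toy_hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
				   uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;
	return bucket;
}

int
main(void)
{
	uint32_t	num_buckets = 10;	/* no longer forced to a power of two */
	uint32_t	maxbucket = num_buckets - 1;
	uint32_t	highmask = (1U << toy_log2(num_buckets + 1)) - 1;	/* 15 */
	uint32_t	lowmask = highmask >> 1;	/* 7 */
	uint32_t	keys[] = {3, 12, 29, 1234567891};
	int			i;

	printf("maxbucket=%u highmask=%u lowmask=%u\n", maxbucket, highmask, lowmask);
	for (i = 0; i < 4; i++)
		printf("hashkey %u -> bucket %u\n", keys[i],
			   toy_hashkey2bucket(keys[i], maxbucket, highmask, lowmask));
	return 0;
}
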
/*
@@ -482,16 +596,17 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
void
_hash_pageinit(Page page, Size size)
{
- Assert(PageIsNew(page));
PageInit(page, size, sizeof(HashPageOpaqueData));
}
/*
* Attempt to expand the hash table by creating one new bucket.
*
- * This will silently do nothing if it cannot get the needed locks.
+ * This will silently do nothing if we can't get a cleanup lock on the old or
+ * new bucket.
*
- * The caller should hold no locks on the hash index.
+ * Complete the pending split and remove the tuples from the old bucket,
+ * if there are any left over from the previous split.
*
* The caller must hold a pin, but no lock, on the metapage buffer.
* The buffer is returned in the same state.
@@ -506,15 +621,24 @@ _hash_expandtable(Relation rel, Buffer metabuf)
BlockNumber start_oblkno;
BlockNumber start_nblkno;
Buffer buf_nblkno;
+ Buffer buf_oblkno;
+ Page opage;
+ Page npage;
+ HashPageOpaque oopaque;
+ HashPageOpaque nopaque;
uint32 maxbucket;
uint32 highmask;
uint32 lowmask;
+ bool metap_update_masks = false;
+ bool metap_update_splitpoint = false;
+
+restart_expand:
/*
* Write-lock the meta page. It used to be necessary to acquire a
* heavyweight lock to begin a split, but that is no longer required.
*/
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
_hash_checkpage(rel, metabuf, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -542,17 +666,22 @@ _hash_expandtable(Relation rel, Buffer metabuf)
* than a disk block then this would be an independent constraint.
*
* If you change this, see also the maximum initial number of buckets in
- * _hash_metapinit().
+ * _hash_init().
*/
if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
goto fail;
/*
- * Determine which bucket is to be split, and attempt to lock the old
- * bucket. If we can't get the lock, give up.
+ * Determine which bucket is to be split, and attempt to take cleanup lock
+ * on the old bucket. If we can't get the lock, give up.
+ *
+ * The cleanup lock protects us not only against other backends, but
+ * against our own backend as well.
*
- * The lock protects us against other backends, but not against our own
- * backend. Must check for active scans separately.
+ * The cleanup lock is mainly to protect the split from concurrent
+ * inserts. See src/backend/access/hash/README, Lock Definitions for
+	 * further details.  Due to this locking restriction, if there is any
+	 * pending scan, the split will give up, which is not good but harmless.
*/
new_bucket = metap->hashm_maxbucket + 1;
@@ -560,14 +689,86 @@ _hash_expandtable(Relation rel, Buffer metabuf)
start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
- if (_hash_has_active_scan(rel, old_bucket))
+ buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
+ if (!buf_oblkno)
goto fail;
- if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
- goto fail;
+ opage = BufferGetPage(buf_oblkno);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+
+ /*
+	 * We want to finish any pending split from this bucket before starting a
+	 * new one: there is no apparent benefit in deferring it, and allowing
+	 * splits that involve multiple buckets would complicate the code,
+	 * especially if a new split also failed.  We need not consider the new
+	 * bucket for completing the split here, as a re-split of the new bucket
+	 * cannot start while there is still a pending split from the old bucket.
+ */
+ if (H_BUCKET_BEING_SPLIT(oopaque))
+ {
+ /*
+ * Copy bucket mapping info now; refer the comment in code below where
+ * we copy this information before calling _hash_splitbucket to see
+ * why this is okay.
+ */
+ maxbucket = metap->hashm_maxbucket;
+ highmask = metap->hashm_highmask;
+ lowmask = metap->hashm_lowmask;
+
+ /*
+ * Release the lock on metapage and old_bucket, before completing the
+ * split.
+ */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
+
+ _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
+ highmask, lowmask);
+
+ /* release the pin on old buffer and retry for expand. */
+ _hash_dropbuf(rel, buf_oblkno);
+
+ goto restart_expand;
+ }
/*
- * Likewise lock the new bucket (should never fail).
+	 * Clean the tuples remaining from the previous split.  This operation
+	 * requires a cleanup lock, and we already have one on the old bucket, so
+	 * let's do it.  We also don't want to allow further splits from the
+	 * bucket till the garbage of the previous split is cleaned.  This has two
+	 * advantages: first, it helps in avoiding bloat due to garbage, and
+	 * second, during cleanup of the bucket we can always be sure that the
+	 * garbage tuples belong to the most recently split bucket.  On the
+	 * contrary, if we allowed cleanup of the bucket after the meta page is
+	 * updated to indicate the new split but before the actual split, the
+	 * cleanup operation would not be able to decide whether a tuple has been
+	 * moved to the newly created bucket, and could end up deleting such tuples.
+ */
+ if (H_NEEDS_SPLIT_CLEANUP(oopaque))
+ {
+ /*
+ * Copy bucket mapping info now; refer to the comment in code below
+ * where we copy this information before calling _hash_splitbucket to
+ * see why this is okay.
+ */
+ maxbucket = metap->hashm_maxbucket;
+ highmask = metap->hashm_highmask;
+ lowmask = metap->hashm_lowmask;
+
+ /* Release the metapage lock. */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+ hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
+ maxbucket, highmask, lowmask, NULL, NULL, true,
+ NULL, NULL);
+
+ _hash_dropbuf(rel, buf_oblkno);
+
+ goto restart_expand;
+ }
+
+ /*
+ * There shouldn't be any active scan on new bucket.
*
* Note: it is safe to compute the new bucket's blkno here, even though we
* may still need to update the BUCKET_TO_BLKNO mapping. This is because
@@ -576,32 +777,28 @@ _hash_expandtable(Relation rel, Buffer metabuf)
*/
start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
- if (_hash_has_active_scan(rel, new_bucket))
- elog(ERROR, "scan in progress on supposedly new bucket");
-
- if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
- elog(ERROR, "could not get lock on supposedly new bucket");
-
/*
- * If the split point is increasing (hashm_maxbucket's log base 2
- * increases), we need to allocate a new batch of bucket pages.
+ * If the split point is increasing we need to allocate a new batch of
+ * bucket pages.
*/
- spare_ndx = _hash_log2(new_bucket + 1);
+ spare_ndx = _hash_spareindex(new_bucket + 1);
if (spare_ndx > metap->hashm_ovflpoint)
{
+ uint32 buckets_to_add;
+
Assert(spare_ndx == metap->hashm_ovflpoint + 1);
/*
- * The number of buckets in the new splitpoint is equal to the total
- * number already in existence, i.e. new_bucket. Currently this maps
- * one-to-one to blocks required, but someday we may need a more
- * complicated calculation here.
+ * We treat allocation of buckets as a separate WAL-logged action.
+		 * Even if we fail after this operation, we won't leak bucket pages;
+ * rather, the next split will consume this space. In any case, even
+ * without failure we don't use all the space in one split operation.
*/
- if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+ buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
+ if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
{
/* can't split due to BlockNumber overflow */
- _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
- _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+ _hash_relbuf(rel, buf_oblkno);
goto fail;
}
}
@@ -609,21 +806,29 @@ _hash_expandtable(Relation rel, Buffer metabuf)
/*
* Physically allocate the new bucket's primary page. We want to do this
* before changing the metapage's mapping info, in case we can't get the
- * disk space.
+	 * disk space.  Ideally, we don't need a cleanup lock on the new bucket, as
+	 * no other backend can find this bucket until the meta page is updated.
+	 * However, it is good to be consistent with the old bucket's locking.
*/
buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
+ if (!IsBufferCleanupOK(buf_nblkno))
+ {
+ _hash_relbuf(rel, buf_oblkno);
+ _hash_relbuf(rel, buf_nblkno);
+ goto fail;
+ }
/*
- * Okay to proceed with split. Update the metapage bucket mapping info.
- *
- * Since we are scribbling on the metapage data right in the shared
- * buffer, any failure in this next little bit leaves us with a big
+ * Since we are scribbling on the pages in the shared buffers, establish a
+ * critical section. Any failure in this next code leaves us with a big
* problem: the metapage is effectively corrupt but could get written back
- * to disk. We don't really expect any failure, but just to be sure,
- * establish a critical section.
+ * to disk.
*/
START_CRIT_SECTION();
+ /*
+ * Okay to proceed with split. Update the metapage bucket mapping info.
+ */
metap->hashm_maxbucket = new_bucket;
if (new_bucket > metap->hashm_highmask)
@@ -631,22 +836,22 @@ _hash_expandtable(Relation rel, Buffer metabuf)
/* Starting a new doubling */
metap->hashm_lowmask = metap->hashm_highmask;
metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
+ metap_update_masks = true;
}
/*
- * If the split point is increasing (hashm_maxbucket's log base 2
- * increases), we need to adjust the hashm_spares[] array and
- * hashm_ovflpoint so that future overflow pages will be created beyond
- * this new batch of bucket pages.
+ * If the split point is increasing we need to adjust the hashm_spares[]
+ * array and hashm_ovflpoint so that future overflow pages will be created
+ * beyond this new batch of bucket pages.
*/
if (spare_ndx > metap->hashm_ovflpoint)
{
metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
metap->hashm_ovflpoint = spare_ndx;
+ metap_update_splitpoint = true;
}
- /* Done mucking with metapage */
- END_CRIT_SECTION();
+ MarkBufferDirty(metabuf);
/*
* Copy bucket mapping info now; this saves re-accessing the meta page
@@ -659,18 +864,92 @@ _hash_expandtable(Relation rel, Buffer metabuf)
highmask = metap->hashm_highmask;
lowmask = metap->hashm_lowmask;
- /* Write out the metapage and drop lock, but keep pin */
- _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+ opage = BufferGetPage(buf_oblkno);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+
+ /*
+ * Mark the old bucket to indicate that split is in progress. (At
+ * operation end, we will clear the split-in-progress flag.) Also, for a
+ * primary bucket page, hasho_prevblkno stores the number of buckets that
+ * existed as of the last split, so we must update that value here.
+ */
+ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+ oopaque->hasho_prevblkno = maxbucket;
+
+ MarkBufferDirty(buf_oblkno);
+
+ npage = BufferGetPage(buf_nblkno);
+
+ /*
+ * initialize the new bucket's primary page and mark it to indicate that
+ * split is in progress.
+ */
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ nopaque->hasho_prevblkno = maxbucket;
+ nopaque->hasho_nextblkno = InvalidBlockNumber;
+ nopaque->hasho_bucket = new_bucket;
+ nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
+ nopaque->hasho_page_id = HASHO_PAGE_ID;
+
+ MarkBufferDirty(buf_nblkno);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_split_allocate_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.new_bucket = maxbucket;
+ xlrec.old_bucket_flag = oopaque->hasho_flag;
+ xlrec.new_bucket_flag = nopaque->hasho_flag;
+ xlrec.flags = 0;
+
+ XLogBeginInsert();
+
+ XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
+
+ if (metap_update_masks)
+ {
+ xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
+ XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
+ XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
+ }
+
+ if (metap_update_splitpoint)
+ {
+ xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
+ XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
+ sizeof(uint32));
+ XLogRegisterBufData(2,
+ (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
+ sizeof(uint32));
+ }
+
+ XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
+
+ PageSetLSN(BufferGetPage(buf_oblkno), recptr);
+ PageSetLSN(BufferGetPage(buf_nblkno), recptr);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* drop lock, but keep pin */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
/* Relocate records to the new bucket */
_hash_splitbucket(rel, metabuf,
old_bucket, new_bucket,
- start_oblkno, buf_nblkno,
+ buf_oblkno, buf_nblkno, NULL,
maxbucket, highmask, lowmask);
- /* Release bucket locks, allowing others to access them */
- _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
- _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+ /* all done, now release the locks and pins on primary buckets. */
+ _hash_relbuf(rel, buf_oblkno);
+ _hash_relbuf(rel, buf_nblkno);
return;
@@ -678,7 +957,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
fail:
/* We didn't write the metapage, so just drop lock */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
}
@@ -698,7 +977,7 @@ fail:
* hash indexes sequentially anyway, that probably doesn't matter.
*
* XXX It's annoying that this code is executed with the metapage lock held.
- * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * We need to interlock against _hash_addovflpage() adding a new overflow page
* concurrently, but it'd likely be better to use LockRelationForExtension
* for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
* so it may not be worth worrying about.
@@ -711,6 +990,8 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
BlockNumber lastblock;
char zerobuf[BLCKSZ];
+ Page page;
+ HashPageOpaque ovflopaque;
lastblock = firstblock + nblocks - 1;
@@ -721,7 +1002,29 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
if (lastblock < firstblock || lastblock == InvalidBlockNumber)
return false;
- MemSet(zerobuf, 0, sizeof(zerobuf));
+ page = (Page) zerobuf;
+
+ /*
+ * Initialize the page. Just zeroing the page won't work; see
+ * _hash_freeovflpage for similar usage. We take care to make the special
+ * space valid for the benefit of tools such as pageinspect.
+ */
+ _hash_pageinit(page, BLCKSZ);
+
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ ovflopaque->hasho_prevblkno = InvalidBlockNumber;
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_bucket = -1;
+ ovflopaque->hasho_flag = LH_UNUSED_PAGE;
+ ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+ if (RelationNeedsWAL(rel))
+ log_newpage(&rel->rd_node,
+ MAIN_FORKNUM,
+ lastblock,
+ zerobuf,
+ true);
RelationOpenSmgr(rel);
smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
@@ -733,18 +1036,28 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
/*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*
+ * This routine is used to partition the tuples between the old and new bucket
+ * and to finish incomplete split operations.  To finish a previously
+ * interrupted split, the caller needs to fill htab.  If htab is set, then we
+ * skip moving tuples that already exist in htab; otherwise, a NULL htab
+ * indicates that all the tuples belonging to the new bucket are to be
+ * moved.
+ *
* We are splitting a bucket that consists of a base bucket page and zero
* or more overflow (bucket chain) pages. We must relocate tuples that
- * belong in the new bucket, and compress out any free space in the old
- * bucket.
+ * belong in the new bucket.
*
- * The caller must hold exclusive locks on both buckets to ensure that
+ * The caller must hold cleanup locks on both buckets to ensure that
* no one else is trying to access them (see README).
*
* The caller must hold a pin, but no lock, on the metapage buffer.
* The buffer is returned in the same state. (The metapage is only
* touched if it becomes necessary to add or remove overflow pages.)
*
+ * The split needs to retain pins on the primary bucket pages of both the old
+ * and new buckets till the end of the operation.  This prevents vacuum from
+ * starting while a split is in progress.
+ *
* In addition, the caller must have created the new bucket's base page,
* which is passed in buffer nbuf, pinned and write-locked. That lock and
* pin are released here. (The API is set up this way because we must do
@@ -756,36 +1069,32 @@ _hash_splitbucket(Relation rel,
Buffer metabuf,
Bucket obucket,
Bucket nbucket,
- BlockNumber start_oblkno,
+ Buffer obuf,
Buffer nbuf,
+ HTAB *htab,
uint32 maxbucket,
uint32 highmask,
uint32 lowmask)
{
- Buffer obuf;
+ Buffer bucket_obuf;
+ Buffer bucket_nbuf;
Page opage;
Page npage;
HashPageOpaque oopaque;
HashPageOpaque nopaque;
+ OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
+ IndexTuple itups[MaxIndexTuplesPerPage];
+ Size all_tups_size = 0;
+ int i;
+ uint16 nitups = 0;
- /*
- * It should be okay to simultaneously write-lock pages from each bucket,
- * since no one else can be trying to acquire buffer lock on pages of
- * either bucket.
- */
- obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE);
+ bucket_obuf = obuf;
opage = BufferGetPage(obuf);
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ bucket_nbuf = nbuf;
npage = BufferGetPage(nbuf);
-
- /* initialize the new bucket's primary page */
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
- nopaque->hasho_prevblkno = InvalidBlockNumber;
- nopaque->hasho_nextblkno = InvalidBlockNumber;
- nopaque->hasho_bucket = nbucket;
- nopaque->hasho_flag = LH_BUCKET_PAGE;
- nopaque->hasho_page_id = HASHO_PAGE_ID;
/*
* Partition the tuples in the old bucket between the old bucket and the
@@ -798,8 +1107,6 @@ _hash_splitbucket(Relation rel,
BlockNumber oblkno;
OffsetNumber ooffnum;
OffsetNumber omaxoffnum;
- OffsetNumber deletable[MaxOffsetNumber];
- int ndeletable = 0;
/* Scan each tuple in old page */
omaxoffnum = PageGetMaxOffsetNumber(opage);
@@ -810,54 +1117,86 @@ _hash_splitbucket(Relation rel,
IndexTuple itup;
Size itemsz;
Bucket bucket;
+ bool found = false;
+
+ /* skip dead tuples */
+ if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
+ continue;
/*
- * Fetch the item's hash key (conveniently stored in the item) and
- * determine which bucket it now belongs in.
+ * Before inserting a tuple, probe the hash table containing TIDs
+ * of tuples belonging to new bucket, if we find a match, then
+ * skip that tuple, else fetch the item's hash key (conveniently
+ * stored in the item) and determine which bucket it now belongs
+ * in.
*/
itup = (IndexTuple) PageGetItem(opage,
PageGetItemId(opage, ooffnum));
+
+ if (htab)
+ (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
+
+ if (found)
+ continue;
+
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
maxbucket, highmask, lowmask);
if (bucket == nbucket)
{
+ IndexTuple new_itup;
+
+ /*
+ * make a copy of index tuple as we have to scribble on it.
+ */
+ new_itup = CopyIndexTuple(itup);
+
+ /*
+				 * mark the index tuple as moved by split; such tuples are
+				 * skipped by scans if a split is in progress for the bucket.
+ */
+ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
+
/*
* insert the tuple into the new bucket. if it doesn't fit on
* the current page in the new bucket, we must allocate a new
* overflow page and place the tuple on that page instead.
- *
- * XXX we have a problem here if we fail to get space for a
- * new overflow page: we'll error out leaving the bucket split
- * only partially complete, meaning the index is corrupt,
- * since searches may fail to find entries they should find.
*/
- itemsz = IndexTupleDSize(*itup);
+ itemsz = IndexTupleDSize(*new_itup);
itemsz = MAXALIGN(itemsz);
- if (PageGetFreeSpace(npage) < itemsz)
+ if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
{
- /* write out nbuf and drop lock, but keep pin */
- _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
+ /*
+					 * Change the shared buffer state in a critical section;
+					 * otherwise, any error could make it unrecoverable.
+ */
+ START_CRIT_SECTION();
+
+ _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
+ MarkBufferDirty(nbuf);
+ /* log the split operation before releasing the lock */
+ log_split_page(rel, nbuf);
+
+ END_CRIT_SECTION();
+
+ /* drop lock, but keep pin */
+ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
+
+ /* be tidy */
+ for (i = 0; i < nitups; i++)
+ pfree(itups[i]);
+ nitups = 0;
+ all_tups_size = 0;
+
/* chain to a new overflow page */
- nbuf = _hash_addovflpage(rel, metabuf, nbuf);
+ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
npage = BufferGetPage(nbuf);
- /* we don't need nopaque within the loop */
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
}
- /*
- * Insert tuple on new page, using _hash_pgaddtup to ensure
- * correct ordering by hashkey. This is a tad inefficient
- * since we may have to shuffle itempointers repeatedly.
- * Possible future improvement: accumulate all the items for
- * the new page and qsort them before insertion.
- */
- (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);
-
- /*
- * Mark tuple for deletion from old page.
- */
- deletable[ndeletable++] = ooffnum;
+ itups[nitups++] = new_itup;
+ all_tups_size += itemsz;
}
else
{
@@ -870,35 +1209,378 @@ _hash_splitbucket(Relation rel,
oblkno = oopaque->hasho_nextblkno;
- /*
- * Done scanning this old page. If we moved any tuples, delete them
- * from the old page.
- */
- if (ndeletable > 0)
- {
- PageIndexMultiDelete(opage, deletable, ndeletable);
- _hash_wrtbuf(rel, obuf);
- }
+ /* retain the pin on the old primary bucket */
+ if (obuf == bucket_obuf)
+ LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, obuf);
/* Exit loop if no more overflow pages in old bucket */
if (!BlockNumberIsValid(oblkno))
+ {
+ /*
+			 * Change the shared buffer state in a critical section; otherwise,
+			 * any error could make it unrecoverable.
+ */
+ START_CRIT_SECTION();
+
+ _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
+ MarkBufferDirty(nbuf);
+ /* log the split operation before releasing the lock */
+ log_split_page(rel, nbuf);
+
+ END_CRIT_SECTION();
+
+ if (nbuf == bucket_nbuf)
+ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, nbuf);
+
+ /* be tidy */
+ for (i = 0; i < nitups; i++)
+ pfree(itups[i]);
break;
+ }
/* Else, advance to next old page */
- obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
opage = BufferGetPage(obuf);
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
}
/*
* We're at the end of the old bucket chain, so we're done partitioning
- * the tuples. Before quitting, call _hash_squeezebucket to ensure the
- * tuples remaining in the old bucket (including the overflow pages) are
- * packed as tightly as possible. The new bucket is already tight.
+ * the tuples. Mark the old and new buckets to indicate split is
+	 * the tuples.  Mark the old and new buckets to indicate that the split is
+	 * finished.
+ * To avoid deadlocks due to locking order of buckets, first lock the old
+ * bucket and then the new bucket.
+ */
+ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
+ opage = BufferGetPage(bucket_obuf);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+
+ LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
+ npage = BufferGetPage(bucket_nbuf);
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+
+ START_CRIT_SECTION();
+
+ oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
+ nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
+
+ /*
+ * After the split is finished, mark the old bucket to indicate that it
+ * contains deletable tuples. Vacuum will clear split-cleanup flag after
+ * deleting such tuples.
+ */
+ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+
+ /*
+	 * now write the buffers; here we don't release the locks, as the caller
+	 * is responsible for releasing them.
+ */
+ MarkBufferDirty(bucket_obuf);
+ MarkBufferDirty(bucket_nbuf);
+
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_hash_split_complete xlrec;
+
+ xlrec.old_bucket_flag = oopaque->hasho_flag;
+ xlrec.new_bucket_flag = nopaque->hasho_flag;
+
+ XLogBeginInsert();
+
+ XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
+
+ XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
+
+ PageSetLSN(BufferGetPage(bucket_obuf), recptr);
+ PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+}
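
One notable change in _hash_splitbucket() above: tuples bound for the new bucket are no longer inserted one at a time. They are collected in itups[] and written with a single _hash_pgaddmultitup() call (and one WAL record via log_split_page()) either when the next tuple would no longer fit, as judged by PageGetFreeSpaceForMultipleTuples(), or when the old bucket chain ends. The standalone program below models only that batching decision; PAGE_BUDGET and ITEM_OVERHEAD are made-up constants standing in for PostgreSQL's real page accounting.

#include <stdio.h>

#define PAGE_BUDGET   8192		/* assumed usable bytes per page (toy value) */
#define ITEM_OVERHEAD 4			/* assumed per-tuple line-pointer cost (toy value) */

/* write the accumulated batch to the "current page" in one go */
static void
flush_batch(int page_no, const size_t *batch, int nbatch)
{
	size_t		total = 0;
	int			i;

	for (i = 0; i < nbatch; i++)
		total += batch[i];
	printf("page %d: wrote %d tuples, %zu payload bytes in one batch\n",
		   page_no, nbatch, total);
}

int
main(void)
{
	size_t		tuples[] = {700, 1500, 900, 2200, 400, 3100, 800, 1200, 2600};
	int			ntuples = (int) (sizeof(tuples) / sizeof(tuples[0]));
	size_t		batch[sizeof(tuples) / sizeof(tuples[0])];
	size_t		batch_size = 0;
	int			nbatch = 0;
	int			page_no = 0;
	int			i;

	for (i = 0; i < ntuples; i++)
	{
		size_t		itemsz = tuples[i];

		/* would this tuple overflow the page, counting line-pointer overhead? */
		if (batch_size + itemsz + (size_t) (nbatch + 1) * ITEM_OVERHEAD > PAGE_BUDGET)
		{
			flush_batch(page_no++, batch, nbatch);	/* like _hash_pgaddmultitup */
			nbatch = 0;			/* "move" to a fresh overflow page */
			batch_size = 0;
		}
		batch[nbatch++] = itemsz;
		batch_size += itemsz;
	}
	if (nbatch > 0)
		flush_batch(page_no, batch, nbatch);	/* final flush at end of chain */
	return 0;
}
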
+
+/*
+ * _hash_finish_split() -- Finish the previously interrupted split operation
+ *
+ * To complete the split operation, we form a hash table of the TIDs already
+ * present in the new bucket, which the split then uses to skip tuples that
+ * were moved before the split operation was interrupted.
+ *
+ * The caller must hold a pin, but no lock, on the metapage and old bucket's
+ * primary page buffer. The buffers are returned in the same state. (The
+ * metapage is only touched if it becomes necessary to add or remove overflow
+ * pages.)
+ */
+void
+_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask)
+{
+ HASHCTL hash_ctl;
+ HTAB *tidhtab;
+ Buffer bucket_nbuf = InvalidBuffer;
+ Buffer nbuf;
+ Page npage;
+ BlockNumber nblkno;
+ BlockNumber bucket_nblkno;
+ HashPageOpaque npageopaque;
+ Bucket nbucket;
+ bool found;
+
+	/* Initialize the hash table used to track TIDs */
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(ItemPointerData);
+ hash_ctl.entrysize = sizeof(ItemPointerData);
+ hash_ctl.hcxt = CurrentMemoryContext;
+
+ tidhtab =
+ hash_create("bucket ctids",
+ 256, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
+
+ /*
+ * Scan the new bucket and build hash table of TIDs
+ */
+ for (;;)
+ {
+ OffsetNumber noffnum;
+ OffsetNumber nmaxoffnum;
+
+ nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
+ LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+
+ /* remember the primary bucket buffer to acquire cleanup lock on it. */
+ if (nblkno == bucket_nblkno)
+ bucket_nbuf = nbuf;
+
+ npage = BufferGetPage(nbuf);
+ npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+
+ /* Scan each tuple in new page */
+ nmaxoffnum = PageGetMaxOffsetNumber(npage);
+ for (noffnum = FirstOffsetNumber;
+ noffnum <= nmaxoffnum;
+ noffnum = OffsetNumberNext(noffnum))
+ {
+ IndexTuple itup;
+
+ /* Fetch the item's TID and insert it in hash table. */
+ itup = (IndexTuple) PageGetItem(npage,
+ PageGetItemId(npage, noffnum));
+
+ (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
+
+ Assert(!found);
+ }
+
+ nblkno = npageopaque->hasho_nextblkno;
+
+ /*
+		 * release our lock without modifying the buffer, and make sure to
+		 * retain the pin on the primary bucket.
+ */
+ if (nbuf == bucket_nbuf)
+ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, nbuf);
+
+ /* Exit loop if no more overflow pages in new bucket */
+ if (!BlockNumberIsValid(nblkno))
+ break;
+ }
+
+ /*
+	 * Conditionally get the cleanup lock on the old and new buckets to perform
+	 * the split operation.  If we don't get the cleanup locks, silently give
+	 * up; the next insertion on the old bucket will try again to complete the
+	 * split.
+ */
+ if (!ConditionalLockBufferForCleanup(obuf))
+ {
+ hash_destroy(tidhtab);
+ return;
+ }
+ if (!ConditionalLockBufferForCleanup(bucket_nbuf))
+ {
+ LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ hash_destroy(tidhtab);
+ return;
+ }
+
+ npage = BufferGetPage(bucket_nbuf);
+ npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ nbucket = npageopaque->hasho_bucket;
+
+ _hash_splitbucket(rel, metabuf, obucket,
+ nbucket, obuf, bucket_nbuf, tidhtab,
+ maxbucket, highmask, lowmask);
+
+ _hash_relbuf(rel, bucket_nbuf);
+ LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ hash_destroy(tidhtab);
+}
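
_hash_finish_split() above first walks the new bucket and records every TID it finds in a dynahash table (hash_create()/hash_search() with HASH_BLOBS); _hash_splitbucket() then probes that table so that tuples moved before the interruption are not moved a second time. The standalone program below models the same collect-then-probe idea, but with a sorted array plus bsearch instead of dynahash, and with made-up TID values.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* toy stand-in for ItemPointerData: (block, offset) */
typedef struct
{
	uint32_t	block;
	uint16_t	offset;
} ToyTid;

static int
tid_cmp(const void *a, const void *b)
{
	const ToyTid *x = (const ToyTid *) a;
	const ToyTid *y = (const ToyTid *) b;

	if (x->block != y->block)
		return (x->block < y->block) ? -1 : 1;
	if (x->offset != y->offset)
		return (x->offset < y->offset) ? -1 : 1;
	return 0;
}

int
main(void)
{
	/* TIDs found in the new bucket, i.e. already moved before the interruption */
	ToyTid		moved[] = {{10, 3}, {10, 7}, {11, 2}};
	int			nmoved = 3;

	/* old-bucket tuples that hash to the new bucket and must be (re)examined */
	ToyTid		candidates[] = {{10, 3}, {10, 5}, {10, 7}, {11, 2}, {11, 9}};
	int			ncand = 5;
	int			i;

	/* "build the hash table": here we just sort so bsearch can probe it */
	qsort(moved, (size_t) nmoved, sizeof(ToyTid), tid_cmp);

	for (i = 0; i < ncand; i++)
	{
		int			found = bsearch(&candidates[i], moved, (size_t) nmoved,
									sizeof(ToyTid), tid_cmp) != NULL;

		printf("(%u,%u): %s\n", candidates[i].block, candidates[i].offset,
			   found ? "already in new bucket, skip" : "move to new bucket");
	}
	return 0;
}
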
+
+/*
+ * log_split_page() -- Log the split operation
+ *
+ * We log the split operation once the new page in the new bucket gets full,
+ * so we log the entire page.
+ *
+ * 'buf' must be locked by the caller which is also responsible for unlocking
+ * it.
+ */
+static void
+log_split_page(Relation rel, Buffer buf)
+{
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
+
+ PageSetLSN(BufferGetPage(buf), recptr);
+ }
+}
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If metabuf is not InvalidBuffer, the caller must hold a pin, but no lock,
+ * on the metapage.  If it is InvalidBuffer and we have to read the metapage
+ * to refresh the cache, we'll set it before returning and return with a pin
+ * but no lock on it; the caller is responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+ Page page;
+
+ Assert(metabuf);
+ if (force_refresh || rel->rd_amcache == NULL)
+ {
+ char *cache = NULL;
+
+ /*
+ * It's important that we don't set rd_amcache to an invalid value.
+ * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
+ * install a pointer to the newly-allocated storage in the actual
+		 * relcache entry until both have succeeded.
+ */
+ if (rel->rd_amcache == NULL)
+ cache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(HashMetaPageData));
+
+ /* Read the metapage. */
+ if (BufferIsValid(*metabuf))
+ LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+ else
+ *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+ LH_META_PAGE);
+ page = BufferGetPage(*metabuf);
+
+ /* Populate the cache. */
+ if (rel->rd_amcache == NULL)
+ rel->rd_amcache = cache;
+ memcpy(rel->rd_amcache, HashPageGetMeta(page),
+ sizeof(HashMetaPageData));
+
+ /* Release metapage lock, but keep the pin. */
+ LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated.  This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage every
+ * time.  This saves one buffer access every time we want to reach the target
+ * bucket buffer, which is a very helpful saving in bufmgr traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey to bucket buffer mapping. Some callers need this info to reach the
+ * old bucket in case of bucket split, see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+ HashMetaPage *cachedmetap)
+{
+ HashMetaPage metap;
+ Buffer buf;
+ Buffer metabuf = InvalidBuffer;
+ Page page;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashPageOpaque opaque;
+
+	/* We read from the target bucket buffer, hence locking is a must. */
+ Assert(access == HASH_READ || access == HASH_WRITE);
+
+ metap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(metap != NULL);
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
*/
- _hash_wrtbuf(rel, nbuf);
+ for (;;)
+ {
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
- _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
+ /*
+ * If this bucket hasn't been split, we're done.
+ */
+ if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ break;
+
+ /* Drop lock on this buffer, update cached metapage, and retry. */
+ _hash_relbuf(rel, buf);
+ metap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(metap != NULL);
+ }
+
+ if (BufferIsValid(metabuf))
+ _hash_dropbuf(rel, metabuf);
+
+ if (cachedmetap)
+ *cachedmetap = metap;
+
+ return buf;
}
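
The loop above works entirely from cached metapage contents: it computes a bucket with the cached masks, reads that bucket's primary page, and compares the hashm_maxbucket value stamped into hasho_prevblkno (see _hash_initbuf() and the split code earlier in this file) with the cached maxbucket; if the stamp is newer, the cache is refreshed and the mapping retried. The standalone program below walks one such retry against a hand-made split history; the stamp and mask values are illustrative assumptions, not taken from a real index.

#include <stdio.h>
#include <stdint.h>

/* toy metapage cache: just the fields the hashkey->bucket mapping needs */
typedef struct
{
	uint32_t	maxbucket;
	uint32_t	highmask;
	uint32_t	lowmask;
} ToyMetaCache;

/*
 * Per-bucket stand-in for hasho_prevblkno on a primary bucket page: the
 * hashm_maxbucket value stamped when the bucket was created or last split.
 * Assumed history: the index grew bucket by bucket from 2 up to 8 buckets.
 */
#define NBUCKETS 8
static const uint32_t bucket_stamp[NBUCKETS] = {4, 5, 6, 7, 4, 5, 6, 7};

/* authoritative metapage contents once the index has 8 buckets */
static const ToyMetaCache current_meta = {7, 7, 3};

static uint32_t
toy_hashkey2bucket(uint32_t hashkey, const ToyMetaCache *m)
{
	uint32_t	bucket = hashkey & m->highmask;

	if (bucket > m->maxbucket)
		bucket &= m->lowmask;
	return bucket;
}

int
main(void)
{
	/* stale cache captured back when the index still had only 6 buckets */
	ToyMetaCache cache = {5, 7, 3};
	uint32_t	hashkey = 14;
	uint32_t	bucket;

	for (;;)
	{
		bucket = toy_hashkey2bucket(hashkey, &cache);
		printf("cached maxbucket=%u -> bucket %u (page stamp %u)\n",
			   cache.maxbucket, bucket, bucket_stamp[bucket]);

		/* page stamp newer than the cache? a split happened: refresh, retry */
		if (bucket_stamp[bucket] <= cache.maxbucket)
			break;
		cache = current_meta;	/* like _hash_getcachedmetap(..., true) */
	}
	printf("hashkey %u finally lands in bucket %u\n", hashkey, bucket);
	return 0;
}
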
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
deleted file mode 100644
index fe97ef201a..0000000000
--- a/src/backend/access/hash/hashscan.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * hashscan.c
- * manage scans on hash tables
- *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/backend/access/hash/hashscan.c
- *
- *-------------------------------------------------------------------------
- */
-
-#include "postgres.h"
-
-#include "access/hash.h"
-#include "access/relscan.h"
-#include "utils/memutils.h"
-#include "utils/rel.h"
-#include "utils/resowner.h"
-
-
-/*
- * We track all of a backend's active scans on hash indexes using a list
- * of HashScanListData structs, which are allocated in TopMemoryContext.
- * It's okay to use a long-lived context because we rely on the ResourceOwner
- * mechanism to clean up unused entries after transaction or subtransaction
- * abort. We can't safely keep the entries in the executor's per-query
- * context, because that might be already freed before we get a chance to
- * clean up the list. (XXX seems like there should be a better way to
- * manage this...)
- */
-typedef struct HashScanListData
-{
- IndexScanDesc hashsl_scan;
- ResourceOwner hashsl_owner;
- struct HashScanListData *hashsl_next;
-} HashScanListData;
-
-typedef HashScanListData *HashScanList;
-
-static HashScanList HashScans = NULL;
-
-
-/*
- * ReleaseResources_hash() --- clean up hash subsystem resources.
- *
- * This is here because it needs to touch this module's static var HashScans.
- */
-void
-ReleaseResources_hash(void)
-{
- HashScanList l;
- HashScanList prev;
- HashScanList next;
-
- /*
- * Release all HashScanList items belonging to the current ResourceOwner.
- * Note that we do not release the underlying IndexScanDesc; that's in
- * executor memory and will go away on its own (in fact quite possibly has
- * gone away already, so we mustn't try to touch it here).
- *
- * Note: this should be a no-op during normal query shutdown. However, in
- * an abort situation ExecutorEnd is not called and so there may be open
- * index scans to clean up.
- */
- prev = NULL;
-
- for (l = HashScans; l != NULL; l = next)
- {
- next = l->hashsl_next;
- if (l->hashsl_owner == CurrentResourceOwner)
- {
- if (prev == NULL)
- HashScans = next;
- else
- prev->hashsl_next = next;
-
- pfree(l);
- /* prev does not change */
- }
- else
- prev = l;
- }
-}
-
-/*
- * _hash_regscan() -- register a new scan.
- */
-void
-_hash_regscan(IndexScanDesc scan)
-{
- HashScanList new_el;
-
- new_el = (HashScanList) MemoryContextAlloc(TopMemoryContext,
- sizeof(HashScanListData));
- new_el->hashsl_scan = scan;
- new_el->hashsl_owner = CurrentResourceOwner;
- new_el->hashsl_next = HashScans;
- HashScans = new_el;
-}
-
-/*
- * _hash_dropscan() -- drop a scan from the scan list
- */
-void
-_hash_dropscan(IndexScanDesc scan)
-{
- HashScanList chk,
- last;
-
- last = NULL;
- for (chk = HashScans;
- chk != NULL && chk->hashsl_scan != scan;
- chk = chk->hashsl_next)
- last = chk;
-
- if (chk == NULL)
- elog(ERROR, "hash scan list trashed; cannot find 0x%p", (void *) scan);
-
- if (last == NULL)
- HashScans = chk->hashsl_next;
- else
- last->hashsl_next = chk->hashsl_next;
-
- pfree(chk);
-}
-
-/*
- * Is there an active scan in this bucket?
- */
-bool
-_hash_has_active_scan(Relation rel, Bucket bucket)
-{
- Oid relid = RelationGetRelid(rel);
- HashScanList l;
-
- for (l = HashScans; l != NULL; l = l->hashsl_next)
- {
- if (relid == l->hashsl_scan->indexRelation->rd_id)
- {
- HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque;
-
- if (so->hashso_bucket_valid &&
- so->hashso_bucket == bucket)
- return true;
- }
- }
-
- return false;
-}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 48255584e1..2d9204903f 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -3,7 +3,7 @@
* hashsearch.c
* search code for postgres hash tables
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -63,47 +63,149 @@ _hash_next(IndexScanDesc scan, ScanDirection dir)
}
/*
- * Advance to next page in a bucket, if any.
+ * Advance to next page in a bucket, if any.  If we are scanning the bucket
+ * being populated during a split, then after the last page of that bucket
+ * this function advances to the bucket being split.
*/
static void
-_hash_readnext(Relation rel,
+_hash_readnext(IndexScanDesc scan,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
+ Relation rel = scan->indexRelation;
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool block_found = false;
blkno = (*opaquep)->hasho_nextblkno;
- _hash_relbuf(rel, *bufp);
+
+ /*
+	 * Retain the pin on the primary bucket page till the end of scan.  Refer
+	 * to the comments in _hash_first for why the pin is retained.
+ */
+ if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+ LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, *bufp);
+
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno))
{
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+ block_found = true;
+ }
+ else if (so->hashso_buc_populated && !so->hashso_buc_split)
+ {
+ /*
+ * end of bucket, scan bucket being split if there was a split in
+ * progress at the start of scan.
+ */
+ *bufp = so->hashso_split_bucket_buf;
+
+ /*
+ * buffer for bucket being split must be valid as we acquire the pin
+ * on it before the start of scan and retain it till end of scan.
+ */
+ Assert(BufferIsValid(*bufp));
+
+ LockBuffer(*bufp, BUFFER_LOCK_SHARE);
+
+ /*
+ * setting hashso_buc_split to true indicates that we are scanning
+ * bucket being split.
+ */
+ so->hashso_buc_split = true;
+
+ block_found = true;
+ }
+
+ if (block_found)
+ {
*pagep = BufferGetPage(*bufp);
+ TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
}
}
/*
- * Advance to previous page in a bucket, if any.
+ * Advance to previous page in a bucket, if any.  If the current scan started
+ * during a split, then after the first page of the bucket being split this
+ * function advances to the bucket being populated.
*/
static void
-_hash_readprev(Relation rel,
+_hash_readprev(IndexScanDesc scan,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
+ Relation rel = scan->indexRelation;
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
- _hash_relbuf(rel, *bufp);
+
+ /*
+	 * Retain the pin on the primary bucket page till the end of scan.  Refer
+	 * to the comments in _hash_first for why the pin is retained.
+ */
+ if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+ {
+ LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+ haveprevblk = false;
+ }
+ else
+ {
+ _hash_relbuf(rel, *bufp);
+ haveprevblk = true;
+ }
+
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
- if (BlockNumberIsValid(blkno))
+
+ if (haveprevblk)
{
+ Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
+ TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+
+		 * We always maintain the pin on the bucket page for the whole scan
+		 * operation, so release the additional pin we have acquired here.
+ * so releasing the additional pin we have acquired here.
+ */
+ if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+ _hash_dropbuf(rel, *bufp);
+ }
+ else if (so->hashso_buc_populated && so->hashso_buc_split)
+ {
+ /*
+ * end of bucket, scan bucket being populated if there was a split in
+ * progress at the start of scan.
+ */
+ *bufp = so->hashso_bucket_buf;
+
+ /*
+ * buffer for bucket being populated must be valid as we acquire the
+ * pin on it before the start of scan and retain it till end of scan.
+ */
+ Assert(BufferIsValid(*bufp));
+
+ LockBuffer(*bufp, BUFFER_LOCK_SHARE);
+ *pagep = BufferGetPage(*bufp);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+
+ /* move to the end of bucket chain */
+ while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
+ _hash_readnext(scan, bufp, pagep, opaquep);
+
+ /*
+ * setting hashso_buc_split to false indicates that we are scanning
+ * bucket being populated.
+ */
+ so->hashso_buc_split = false;
}
}
@@ -124,14 +226,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;
uint32 hashkey;
Bucket bucket;
- BlockNumber blkno;
- BlockNumber oldblkno = InvalidBuffer;
- bool retry = false;
Buffer buf;
- Buffer metabuf;
Page page;
HashPageOpaque opaque;
- HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@@ -186,70 +283,77 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- page = BufferGetPage(metabuf);
- metap = HashPageGetMeta(page);
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ bucket = opaque->hasho_bucket;
+
+ so->hashso_bucket_buf = buf;
/*
- * Loop until we get a lock on the correct target bucket.
+ * If a bucket split is in progress, then while scanning the bucket being
+ * populated we need to skip tuples that were copied from the bucket being
+ * split. We also need to maintain a pin on the bucket being split, to
+ * ensure that the split-cleanup work done by vacuum doesn't remove tuples
+ * from it till this scan is done. We need to maintain a pin on the
+ * bucket being populated to ensure that vacuum doesn't squeeze that
+ * bucket till this scan is complete; otherwise, the ordering of tuples
+ * can't be maintained during forward and backward scans. Here, we have
+ * to be cautious about locking order: first, acquire the lock on the
+ * bucket being split; then, release the lock on it but not the pin; then,
+ * acquire a lock on the bucket being populated and re-verify whether the
+ * bucket split is still in progress. Acquiring the lock on the bucket
+ * being split first ensures that vacuum waits for this scan to finish.
*/
- for (;;)
+ if (H_BUCKET_BEING_POPULATED(opaque))
{
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
+ BlockNumber old_blkno;
+ Buffer old_buf;
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /* Release metapage lock, but keep pin. */
- _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+ old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
/*
- * If the previous iteration of this loop locked what is still the
- * correct target bucket, we are done. Otherwise, drop any old lock
- * and lock what now appears to be the correct bucket.
+ * Release the lock on the new bucket, and re-acquire it after acquiring
+ * the lock on the old bucket.
*/
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_droplock(rel, oldblkno, HASH_SHARE);
- }
- _hash_getlock(rel, blkno, HASH_SHARE);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
+ TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf));
/*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
+ * Remember the split bucket buffer, so we can use it later for
+ * scanning.
*/
- _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
- oldblkno = blkno;
- retry = true;
- }
-
- /* done with the metapage */
- _hash_dropbuf(rel, metabuf);
+ so->hashso_split_bucket_buf = old_buf;
+ LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
- /* Update scan opaque state to show we have lock on the bucket */
- so->hashso_bucket = bucket;
- so->hashso_bucket_valid = true;
- so->hashso_bucket_blkno = blkno;
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
- /* Fetch the primary bucket page for the bucket */
- buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
- page = BufferGetPage(buf);
- opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == bucket);
+ if (H_BUCKET_BEING_POPULATED(opaque))
+ so->hashso_buc_populated = true;
+ else
+ {
+ _hash_dropbuf(rel, so->hashso_split_bucket_buf);
+ so->hashso_split_bucket_buf = InvalidBuffer;
+ }
+ }
/* If a backwards scan is requested, move to the end of the chain */
if (ScanDirectionIsBackward(dir))
{
- while (BlockNumberIsValid(opaque->hasho_nextblkno))
- _hash_readnext(rel, &buf, &page, &opaque);
+ /*
+ * A backward scan that starts during a split needs to start from the
+ * end of the bucket being split.
+ */
+ while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
+ (so->hashso_buc_populated && !so->hashso_buc_split))
+ _hash_readnext(scan, &buf, &page, &opaque);
}
/* Now find the first tuple satisfying the qualification */
@@ -273,6 +377,12 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
* false. Else, return true and set the hashso_curpos for the
* scan to the right thing.
*
+ * If the scan started while a split was in progress, we must skip the
+ * tuples that were moved by the split while scanning the bucket being
+ * populated, and then scan the bucket being split to cover all such
+ * tuples. This ensures that scans started during a split don't miss
+ * any tuples.
+ *
* 'bufP' points to the current buffer, which is pinned and read-locked.
* On success exit, we have pin and read-lock on whichever page
* contains the right item; on failure, we have released all buffers.
@@ -338,14 +448,31 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Assert(offnum >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+ /*
+ * skip tuples that were moved by the split operation,
+ * if this scan started while the split was in
+ * progress
+ */
+ if (so->hashso_buc_populated && !so->hashso_buc_split &&
+ (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
+ {
+ offnum = OffsetNumberNext(offnum); /* move forward */
+ continue;
+ }
+
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+
/*
* ran off the end of this page, try the next
*/
- _hash_readnext(rel, &buf, &page, &opaque);
+ _hash_readnext(scan, &buf, &page, &opaque);
if (BufferIsValid(buf))
{
maxoff = PageGetMaxOffsetNumber(page);
@@ -353,7 +480,6 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
}
else
{
- /* end of bucket */
itup = NULL;
break; /* exit for-loop */
}
@@ -379,22 +505,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Assert(offnum <= maxoff);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+ /*
+ * skip tuples that were moved by the split operation,
+ * if this scan started while the split was in
+ * progress
+ */
+ if (so->hashso_buc_populated && !so->hashso_buc_split &&
+ (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
+ {
+ offnum = OffsetNumberPrev(offnum); /* move back */
+ continue;
+ }
+
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+
/*
* ran off the end of this page, try the next
*/
- _hash_readprev(rel, &buf, &page, &opaque);
+ _hash_readprev(scan, &buf, &page, &opaque);
if (BufferIsValid(buf))
{
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
else
{
- /* end of bucket */
itup = NULL;
break; /* exit for-loop */
}
@@ -410,9 +553,16 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (itup == NULL)
{
- /* we ran off the end of the bucket without finding a match */
+ /*
+ * We ran off the end of the bucket without finding a match.
+ * Release the pins on the bucket buffers. Normally, such pins are
+ * released at the end of the scan; however, scrolling cursors can
+ * reacquire the bucket lock and pin multiple times within the same
+ * scan.
+ */
*bufP = so->hashso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
+ _hash_dropscanbuf(rel, so);
return false;
}
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c
index 8938ab5b24..41d615df8b 100644
--- a/src/backend/access/hash/hashsort.c
+++ b/src/backend/access/hash/hashsort.c
@@ -14,7 +14,7 @@
* plenty of locality of access.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -37,7 +37,15 @@ struct HSpool
{
Tuplesortstate *sortstate; /* state data for tuplesort.c */
Relation index;
- uint32 hash_mask; /* bitmask for hash codes */
+
+ /*
+ * We sort the hash keys based on the buckets they belong to. The masks
+ * below are used in _hash_hashkey2bucket to determine the bucket of a
+ * given hash key.
+ */
+ uint32 high_mask;
+ uint32 low_mask;
+ uint32 max_buckets;
};
@@ -56,11 +64,12 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
* num_buckets buckets in the index, the appropriate mask can be computed
* as follows.
*
- * Note: at present, the passed-in num_buckets is always a power of 2, so
- * we could just compute num_buckets - 1. We prefer not to assume that
- * here, though.
+ * NOTE: This hash mask calculation must be kept in sync with the similar
+ * calculation in _hash_init_metabuffer.
*/
- hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+ hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
+ hspool->low_mask = (hspool->high_mask >> 1);
+ hspool->max_buckets = num_buckets - 1;
/*
* We size the sort area as maintenance_work_mem rather than work_mem to
@@ -69,7 +78,9 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
*/
hspool->sortstate = tuplesort_begin_index_hash(heap,
index,
- hspool->hash_mask,
+ hspool->high_mask,
+ hspool->low_mask,
+ hspool->max_buckets,
maintenance_work_mem,
false);
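
A rough standalone sketch of the mask arithmetic above; the bucket count, the
sample keys, and the simplified local copies of _hash_log2 and
_hash_hashkey2bucket are illustrative assumptions, not code from the tree:

#include <stdio.h>
#include <stdint.h>

/* smallest i such that (1 << i) >= num, mirroring _hash_log2 */
static uint32_t hash_log2(uint32_t num)
{
	uint32_t	i,
				limit;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}

/* simplified restatement of _hash_hashkey2bucket */
static uint32_t hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
							   uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;		/* fold back into the existing table half */
	return bucket;
}

int main(void)
{
	uint32_t	num_buckets = 1000;	/* hypothetical initial bucket count */
	uint32_t	high_mask = (1U << hash_log2(num_buckets + 1)) - 1;	/* 1023 */
	uint32_t	low_mask = high_mask >> 1;		/* 511 */
	uint32_t	max_buckets = num_buckets - 1;	/* 999 */
	uint32_t	keys[] = {12345, 1023, 1000};
	int			i;

	for (i = 0; i < 3; i++)
		printf("hash %u -> bucket %u\n", keys[i],
			   hashkey2bucket(keys[i], max_buckets, high_mask, low_mask));
	return 0;
}
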
@@ -101,18 +112,16 @@ _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull)
* create an entire index.
*/
void
-_h_indexbuild(HSpool *hspool)
+_h_indexbuild(HSpool *hspool, Relation heapRel)
{
IndexTuple itup;
- bool should_free;
#ifdef USE_ASSERT_CHECKING
uint32 hashkey = 0;
#endif
tuplesort_performsort(hspool->sortstate);
- while ((itup = tuplesort_getindextuple(hspool->sortstate,
- true, &should_free)) != NULL)
+ while ((itup = tuplesort_getindextuple(hspool->sortstate, true)) != NULL)
{
/*
* Technically, it isn't critical that hash keys be found in sorted
@@ -124,12 +133,12 @@ _h_indexbuild(HSpool *hspool)
#ifdef USE_ASSERT_CHECKING
uint32 lasthashkey = hashkey;
- hashkey = _hash_get_indextuple_hashkey(itup) & hspool->hash_mask;
+ hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+ hspool->max_buckets, hspool->high_mask,
+ hspool->low_mask);
Assert(hashkey >= lasthashkey);
#endif
- _hash_doinsert(hspool->index, itup);
- if (should_free)
- pfree(itup);
+ _hash_doinsert(hspool->index, itup, heapRel);
}
}
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 822862db7a..c513c3b842 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -3,7 +3,7 @@
* hashutil.c
* Utility code for Postgres hash implementation.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -19,7 +19,10 @@
#include "access/relscan.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "storage/buf_internals.h"
+#define CALC_NEW_BUCKET(old_bucket, lowmask) \
+ old_bucket | (lowmask + 1)
/*
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
@@ -147,10 +150,76 @@ _hash_log2(uint32 num)
}
/*
+ * _hash_spareindex -- returns spare index / global splitpoint phase of the
+ * bucket
+ */
+uint32
+_hash_spareindex(uint32 num_bucket)
+{
+ uint32 splitpoint_group;
+ uint32 splitpoint_phases;
+
+ splitpoint_group = _hash_log2(num_bucket);
+
+ if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
+ return splitpoint_group;
+
+ /* account for single-phase groups */
+ splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
+
+ /* account for multi-phase groups before splitpoint_group */
+ splitpoint_phases +=
+ ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
+ HASH_SPLITPOINT_PHASE_BITS);
+
+ /* account for phases within current group */
+ splitpoint_phases +=
+ (((num_bucket - 1) >>
+ (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
+ HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */
+
+ return splitpoint_phases;
+}
+
+/*
+ * _hash_get_totalbuckets -- returns the total number of buckets allocated
+ * up to (and including) the given splitpoint phase.
+ */
+uint32
+_hash_get_totalbuckets(uint32 splitpoint_phase)
+{
+ uint32 splitpoint_group;
+ uint32 total_buckets;
+ uint32 phases_within_splitpoint_group;
+
+ if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
+ return (1 << splitpoint_phase);
+
+ /* get splitpoint's group */
+ splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
+ splitpoint_group +=
+ ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
+ HASH_SPLITPOINT_PHASE_BITS);
+
+ /* account for buckets before splitpoint_group */
+ total_buckets = (1 << (splitpoint_group - 1));
+
+ /* account for buckets within splitpoint_group */
+ phases_within_splitpoint_group =
+ (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
+ HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
+ total_buckets +=
+ (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
+ phases_within_splitpoint_group);
+
+ return total_buckets;
+}
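
A standalone sketch of the splitpoint-phase arithmetic above; the
SPLITPOINT_* values are assumed here for illustration (the real definitions
live elsewhere in the tree), and the two helpers are informal restatements,
not the backend functions. The loop checks the intended invariant: the total
bucket count for a bucket's phase always covers that bucket.

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define SPLITPOINT_PHASE_BITS            2	/* assumed value */
#define SPLITPOINT_PHASES_PER_GRP        (1 << SPLITPOINT_PHASE_BITS)
#define SPLITPOINT_PHASE_MASK            (SPLITPOINT_PHASES_PER_GRP - 1)
#define SPLITPOINT_GROUPS_WITH_ONE_PHASE 10	/* assumed value */

static uint32_t hash_log2(uint32_t num)
{
	uint32_t	i,
				limit;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}

static uint32_t spareindex(uint32_t num_bucket)
{
	uint32_t	grp = hash_log2(num_bucket);

	if (grp < SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return grp;
	return SPLITPOINT_GROUPS_WITH_ONE_PHASE +
		((grp - SPLITPOINT_GROUPS_WITH_ONE_PHASE) << SPLITPOINT_PHASE_BITS) +
		(((num_bucket - 1) >> (grp - (SPLITPOINT_PHASE_BITS + 1))) &
		 SPLITPOINT_PHASE_MASK);
}

static uint32_t totalbuckets(uint32_t phase)
{
	uint32_t	grp,
				total;

	if (phase < SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return 1U << phase;
	grp = SPLITPOINT_GROUPS_WITH_ONE_PHASE +
		((phase - SPLITPOINT_GROUPS_WITH_ONE_PHASE) >> SPLITPOINT_PHASE_BITS);
	total = 1U << (grp - 1);
	total += ((1U << (grp - 1)) >> SPLITPOINT_PHASE_BITS) *
		(((phase - SPLITPOINT_GROUPS_WITH_ONE_PHASE) & SPLITPOINT_PHASE_MASK) + 1);
	return total;
}

int main(void)
{
	uint32_t	n;

	for (n = 1; n <= 100000; n++)
		assert(totalbuckets(spareindex(n)) >= n);
	printf("bucket 600: phase %u, %u buckets allocated so far\n",
		   spareindex(600), totalbuckets(spareindex(600)));
	return 0;
}
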
+
+/*
* _hash_checkpage -- sanity checks on the format of all hash pages
*
- * If flags is not zero, it is a bitwise OR of the acceptable values of
- * hasho_flag.
+ * If flags is not zero, it is a bitwise OR of the acceptable page types
+ * (values of hasho_flag & LH_PAGE_TYPE).
*/
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
@@ -352,3 +421,163 @@ _hash_binsearch_last(Page page, uint32 hash_value)
return lower;
}
+
+/*
+ * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
+ * from which current (new) bucket is being split.
+ */
+BlockNumber
+_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
+{
+ Bucket old_bucket;
+ uint32 mask;
+ Buffer metabuf;
+ HashMetaPage metap;
+ BlockNumber blkno;
+
+ /*
+ * To get the old bucket from the current bucket, we need a mask to modulo
+ * into the lower half of the table. This mask is stored in the metapage as
+ * hashm_lowmask, but we can't rely on that here, because we need the value
+ * of lowmask that was in effect when the bucket split started. Masking off
+ * the most significant bit of the new bucket gives us the old bucket.
+ */
+ mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
+ old_bucket = new_bucket & mask;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+ blkno = BUCKET_TO_BLKNO(metap, old_bucket);
+
+ _hash_relbuf(rel, metabuf);
+
+ return blkno;
+}
+
+/*
+ * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
+ * that will be generated after split from old bucket.
+ *
+ * This is used to find the new bucket from the old bucket, based on the
+ * current table half. It is mainly needed to finish incomplete splits,
+ * where we can be sure that at most one split from the old bucket is in
+ * progress.
+ */
+BlockNumber
+_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
+{
+ Bucket new_bucket;
+ Buffer metabuf;
+ HashMetaPage metap;
+ BlockNumber blkno;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+ new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
+ metap->hashm_lowmask,
+ metap->hashm_maxbucket);
+ blkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+ _hash_relbuf(rel, metabuf);
+
+ return blkno;
+}
+
+/*
+ * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
+ * generated after split from current (old) bucket.
+ *
+ * This is used to find the new bucket from the old bucket. The new bucket
+ * is obtained by OR'ing the old bucket with the most significant bit of the
+ * current table half (the lowmask passed to this function identifies the
+ * msb of the current table half). Multiple buckets could have been split
+ * from the current bucket; we need the first such bucket that exists. The
+ * caller must ensure that no more than one split has happened from the old
+ * bucket.
+ */
+Bucket
+_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+ uint32 lowmask, uint32 maxbucket)
+{
+ Bucket new_bucket;
+
+ new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+ if (new_bucket > maxbucket)
+ {
+ lowmask = lowmask >> 1;
+ new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+ }
+
+ return new_bucket;
+}
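
A small standalone sketch of the old/new bucket arithmetic used by the two
functions above; my_fls() is a local stand-in for fls(), and the sample
bucket number and lowmask are illustrative assumptions:

#include <stdio.h>
#include <stdint.h>

/* 1-based position of the most significant set bit; stand-in for fls() */
static int my_fls(uint32_t x)
{
	int			pos = 0;

	while (x)
	{
		pos++;
		x >>= 1;
	}
	return pos;
}

int main(void)
{
	uint32_t	new_bucket = 11;	/* hypothetical bucket created by a split */
	uint32_t	mask = (1U << (my_fls(new_bucket) - 1)) - 1;
	uint32_t	old_bucket = new_bucket & mask;	/* 11 & 7 = 3 */

	/* the reverse direction, as CALC_NEW_BUCKET does with the old lowmask */
	uint32_t	lowmask = 7;		/* assumed: lower table half was buckets 0..7 */
	uint32_t	again = old_bucket | (lowmask + 1);	/* 3 | 8 = 11 */

	printf("new %u -> old %u -> new %u\n", new_bucket, old_bucket, again);
	return 0;
}
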
+
+/*
+ * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
+ * told us were killed.
+ *
+ * scan->opaque, referenced locally through so, contains information about the
+ * current page and killed tuples thereon (generally, this should only be
+ * called if so->numKilled > 0).
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete.
+ */
+void
+_hash_kill_items(IndexScanDesc scan)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ Page page;
+ HashPageOpaque opaque;
+ OffsetNumber offnum,
+ maxoff;
+ int numKilled = so->numKilled;
+ int i;
+ bool killedsomething = false;
+
+ Assert(so->numKilled > 0);
+ Assert(so->killedItems != NULL);
+
+ /*
+ * Always reset the scan state, so we don't look for the same items on
+ * other pages.
+ */
+ so->numKilled = 0;
+
+ page = BufferGetPage(so->hashso_curbuf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (i = 0; i < numKilled; i++)
+ {
+ offnum = so->killedItems[i].indexOffset;
+
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
+
+ if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid))
+ {
+ /* found the item */
+ ItemIdMarkDead(iid);
+ killedsomething = true;
+ break; /* out of inner search loop */
+ }
+ offnum = OffsetNumberNext(offnum);
+ }
+ }
+
+ /*
+ * Since this can be redone later if needed, mark the buffer dirty only as
+ * a hint. Whenever we mark anything LP_DEAD, we also set the page's
+ * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
+ */
+ if (killedsomething)
+ {
+ opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
+ MarkBufferDirtyHint(so->hashso_curbuf, true);
+ }
+}
diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c
index d8c5ed4d98..f914c015bd 100644
--- a/src/backend/access/hash/hashvalidate.c
+++ b/src/backend/access/hash/hashvalidate.c
@@ -3,7 +3,7 @@
* hashvalidate.c
* Opclass validator for hash.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -25,6 +25,7 @@
#include "parser/parse_coerce.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
+#include "utils/regproc.h"
#include "utils/syscache.h"
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 2368340b08..05fd372664 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3,7 +3,7 @@
* heapam.c
* heap access method code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -38,6 +38,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/hio.h"
@@ -98,11 +99,8 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
Buffer newbuf, HeapTuple oldtup,
HeapTuple newtup, HeapTuple old_key_tup,
bool all_visible_cleared, bool new_all_visible_cleared);
-static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
- Bitmapset *hot_attrs,
- Bitmapset *key_attrs, Bitmapset *id_attrs,
- bool *satisfies_hot, bool *satisfies_key,
- bool *satisfies_id,
+static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
+ Bitmapset *interesting_cols,
HeapTuple oldtup, HeapTuple newtup);
static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
LockTupleMode mode, LockWaitPolicy wait_policy,
@@ -1134,7 +1132,7 @@ relation_open(Oid relationId, LOCKMODE lockmode)
/* Make note that we've accessed a temporary relation */
if (RelationUsesLocalBuffers(r))
- MyXactAccessedTempRel = true;
+ MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
pgstat_initstats(r);
@@ -1180,7 +1178,7 @@ try_relation_open(Oid relationId, LOCKMODE lockmode)
/* Make note that we've accessed a temporary relation */
if (RelationUsesLocalBuffers(r))
- MyXactAccessedTempRel = true;
+ MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
pgstat_initstats(r);
@@ -1760,6 +1758,22 @@ retry:
}
/* ----------------
+ * heap_update_snapshot
+ *
+ * Update snapshot info in heap scan descriptor.
+ * ----------------
+ */
+void
+heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
+{
+ Assert(IsMVCCSnapshot(snapshot));
+
+ RegisterSnapshot(snapshot);
+ scan->rs_snapshot = snapshot;
+ scan->rs_temp_snap = true;
+}
+
+/* ----------------
* heap_getnext - retrieve next tuple in scan
*
* Fix to work with index relations.
@@ -2337,6 +2351,17 @@ FreeBulkInsertState(BulkInsertState bistate)
pfree(bistate);
}
+/*
+ * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
+ */
+void
+ReleaseBulkInsertStatePin(BulkInsertState bistate)
+{
+ if (bistate->current_buf != InvalidBuffer)
+ ReleaseBuffer(bistate->current_buf);
+ bistate->current_buf = InvalidBuffer;
+}
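
A hedged usage sketch of the new entry point (backend context assumed; the
relation and tuple variables are hypothetical, and error handling is
omitted). A bulk-loading caller would drop the retained pin when it switches
to a different target relation:

	BulkInsertState bistate = GetBulkInsertState();

	heap_insert(rel_a, tup1, GetCurrentCommandId(true), 0, bistate);
	heap_insert(rel_a, tup2, GetCurrentCommandId(true), 0, bistate);

	/*
	 * Switching to another target relation: the buffer pinned in bistate
	 * belongs to rel_a, so release it before inserting into rel_b.
	 */
	ReleaseBulkInsertStatePin(bistate);

	heap_insert(rel_b, tup3, GetCurrentCommandId(true), 0, bistate);

	FreeBulkInsertState(bistate);
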
+
/*
* heap_insert - insert tuple into a heap
@@ -2520,7 +2545,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
heaptup->t_len - SizeofHeapTupleHeader);
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info);
@@ -2862,7 +2887,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
XLogRegisterBufData(0, tupledata, totaldatalen);
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP2_ID, info);
@@ -3324,7 +3349,7 @@ l1:
}
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
@@ -3351,7 +3376,7 @@ l1:
Assert(!HeapTupleHasExternal(&tp));
}
else if (HeapTupleHasExternal(&tp))
- toast_delete(relation, &tp);
+ toast_delete(relation, &tp, false);
/*
* Mark tuple for invalidation from system caches at next command
@@ -3459,6 +3484,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
Bitmapset *hot_attrs;
Bitmapset *key_attrs;
Bitmapset *id_attrs;
+ Bitmapset *interesting_attrs;
+ Bitmapset *modified_attrs;
ItemId lp;
HeapTupleData oldtup;
HeapTuple heaptup;
@@ -3476,10 +3503,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
pagefree;
bool have_tuple_lock = false;
bool iscombo;
- bool satisfies_hot;
- bool satisfies_key;
- bool satisfies_id;
bool use_hot_update = false;
+ bool hot_attrs_checked = false;
bool key_intact;
bool all_visible_cleared = false;
bool all_visible_cleared_new = false;
@@ -3505,26 +3530,51 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
errmsg("cannot update tuples during a parallel operation")));
/*
- * Fetch the list of attributes to be checked for HOT update. This is
- * wasted effort if we fail to update or have to put the new tuple on a
- * different page. But we must compute the list before obtaining buffer
- * lock --- in the worst case, if we are doing an update on one of the
- * relevant system catalogs, we could deadlock if we try to fetch the list
- * later. In any case, the relcache caches the data so this is usually
- * pretty cheap.
+ * Fetch the list of attributes to be checked for various operations.
+ *
+ * For HOT considerations, this is wasted effort if we fail to update or
+ * have to put the new tuple on a different page. But we must compute the
+ * list before obtaining buffer lock --- in the worst case, if we are
+ * doing an update on one of the relevant system catalogs, we could
+ * deadlock if we try to fetch the list later. In any case, the relcache
+ * caches the data so this is usually pretty cheap.
*
- * Note that we get a copy here, so we need not worry about relcache flush
- * happening midway through.
+ * We also need columns used by the replica identity and columns that are
+ * considered the "key" of rows in the table.
+ *
+ * Note that we get copies of each bitmap, so we need not worry about
+ * relcache flush happening midway through.
*/
hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
id_attrs = RelationGetIndexAttrBitmap(relation,
INDEX_ATTR_BITMAP_IDENTITY_KEY);
+
block = ItemPointerGetBlockNumber(otid);
buffer = ReadBuffer(relation, block);
page = BufferGetPage(buffer);
+ interesting_attrs = NULL;
+
+ /*
+ * If the page is already full, there is hardly any chance of doing a HOT
+ * update on this page. It might be wasted effort to look for index-column
+ * updates only to later reject a HOT update for lack of space on the same
+ * page. So we are conservative and only fetch hot_attrs if the page is not
+ * already full. Since we are already holding a pin on the buffer, there is
+ * no chance that the buffer can get cleaned up concurrently, and even if
+ * that were possible, in the worst case we would merely lose a chance to do
+ * a HOT update.
+ */
+ if (!PageIsFull(page))
+ {
+ interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
+ hot_attrs_checked = true;
+ }
+ interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
+ interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
+
/*
* Before locking the buffer, pin the visibility map page if it appears to
* be necessary. Since we haven't got the lock yet, someone else might be
@@ -3540,7 +3590,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
Assert(ItemIdIsNormal(lp));
/*
- * Fill in enough data in oldtup for HeapSatisfiesHOTandKeyUpdate to work
+ * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
* properly.
*/
oldtup.t_tableOid = RelationGetRelid(relation);
@@ -3566,6 +3616,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
}
+ /* Determine columns modified by the update. */
+ modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
+ &oldtup, newtup);
+
/*
* If we're not updating any "key" column, we can grab a weaker lock type.
* This allows for more concurrency when we are running simultaneously
@@ -3577,10 +3631,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
* is updates that don't manipulate key columns, not those that
* serendipitiously arrive at the same key values.
*/
- HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, id_attrs,
- &satisfies_hot, &satisfies_key,
- &satisfies_id, &oldtup, newtup);
- if (satisfies_key)
+ if (!bms_overlap(modified_attrs, key_attrs))
{
*lockmode = LockTupleNoKeyExclusive;
mxact_status = MultiXactStatusNoKeyUpdate;
@@ -3818,6 +3869,9 @@ l2:
ReleaseBuffer(vmbuffer);
bms_free(hot_attrs);
bms_free(key_attrs);
+ bms_free(id_attrs);
+ bms_free(modified_attrs);
+ bms_free(interesting_attrs);
return result;
}
@@ -4123,9 +4177,10 @@ l2:
/*
* Since the new tuple is going into the same page, we might be able
* to do a HOT update. Check if any of the index columns have been
- * changed. If not, then HOT update is possible.
+ * changed. If the page was already full, we may have skipped checking
+ * for index columns, and also can't do a HOT update.
*/
- if (satisfies_hot)
+ if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
use_hot_update = true;
}
else
@@ -4140,7 +4195,9 @@ l2:
* ExtractReplicaIdentity() will return NULL if nothing needs to be
* logged.
*/
- old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, !satisfies_id, &old_key_copied);
+ old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
+ bms_overlap(modified_attrs, id_attrs),
+ &old_key_copied);
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
@@ -4287,13 +4344,16 @@ l2:
bms_free(hot_attrs);
bms_free(key_attrs);
+ bms_free(id_attrs);
+ bms_free(modified_attrs);
+ bms_free(interesting_attrs);
return HeapTupleMayBeUpdated;
}
/*
* Check if the specified attribute's value is same in both given tuples.
- * Subroutine for HeapSatisfiesHOTandKeyUpdate.
+ * Subroutine for HeapDetermineModifiedColumns.
*/
static bool
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
@@ -4330,7 +4390,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
/*
* Extract the corresponding values. XXX this is pretty inefficient if
- * there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do
+ * there are many indexed columns. Should HeapDetermineModifiedColumns do
* a single heap_deform_tuple call on each tuple, instead? But that
* doesn't work for system columns ...
*/
@@ -4375,114 +4435,30 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
/*
* Check which columns are being updated.
*
- * This simultaneously checks conditions for HOT updates, for FOR KEY
- * SHARE updates, and REPLICA IDENTITY concerns. Since much of the time they
- * will be checking very similar sets of columns, and doing the same tests on
- * them, it makes sense to optimize and do them together.
- *
- * We receive three bitmapsets comprising the three sets of columns we're
- * interested in. Note these are destructively modified; that is OK since
- * this is invoked at most once in heap_update.
+ * Given an updated tuple, determine (and return as a bitmapset) the set of
+ * interesting columns whose values were changed by the update.
*
- * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not
- * modified indexed columns); key_result is set to TRUE if the update does not
- * modify columns used in the key; id_result is set to TRUE if the update does
- * not modify columns in any index marked as the REPLICA IDENTITY.
+ * The input bitmapset is destructively modified; that is OK since this is
+ * invoked at most once in heap_update.
*/
-static void
-HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
- Bitmapset *key_attrs, Bitmapset *id_attrs,
- bool *satisfies_hot, bool *satisfies_key,
- bool *satisfies_id,
+static Bitmapset *
+HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
HeapTuple oldtup, HeapTuple newtup)
{
- int next_hot_attnum;
- int next_key_attnum;
- int next_id_attnum;
- bool hot_result = true;
- bool key_result = true;
- bool id_result = true;
-
- /* If REPLICA IDENTITY is set to FULL, id_attrs will be empty. */
- Assert(bms_is_subset(id_attrs, key_attrs));
- Assert(bms_is_subset(key_attrs, hot_attrs));
-
- /*
- * If one of these sets contains no remaining bits, bms_first_member will
- * return -1, and after adding FirstLowInvalidHeapAttributeNumber (which
- * is negative!) we'll get an attribute number that can't possibly be
- * real, and thus won't match any actual attribute number.
- */
- next_hot_attnum = bms_first_member(hot_attrs);
- next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
- next_key_attnum = bms_first_member(key_attrs);
- next_key_attnum += FirstLowInvalidHeapAttributeNumber;
- next_id_attnum = bms_first_member(id_attrs);
- next_id_attnum += FirstLowInvalidHeapAttributeNumber;
+ int attnum;
+ Bitmapset *modified = NULL;
- for (;;)
+ while ((attnum = bms_first_member(interesting_cols)) >= 0)
{
- bool changed;
- int check_now;
+ attnum += FirstLowInvalidHeapAttributeNumber;
- /*
- * Since the HOT attributes are a superset of the key attributes and
- * the key attributes are a superset of the id attributes, this logic
- * is guaranteed to identify the next column that needs to be checked.
- */
- if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber)
- check_now = next_hot_attnum;
- else if (key_result && next_key_attnum > FirstLowInvalidHeapAttributeNumber)
- check_now = next_key_attnum;
- else if (id_result && next_id_attnum > FirstLowInvalidHeapAttributeNumber)
- check_now = next_id_attnum;
- else
- break;
-
- /* See whether it changed. */
- changed = !heap_tuple_attr_equals(RelationGetDescr(relation),
- check_now, oldtup, newtup);
- if (changed)
- {
- if (check_now == next_hot_attnum)
- hot_result = false;
- if (check_now == next_key_attnum)
- key_result = false;
- if (check_now == next_id_attnum)
- id_result = false;
-
- /* if all are false now, we can stop checking */
- if (!hot_result && !key_result && !id_result)
- break;
- }
-
- /*
- * Advance the next attribute numbers for the sets that contain the
- * attribute we just checked. As we work our way through the columns,
- * the next_attnum values will rise; but when each set becomes empty,
- * bms_first_member() will return -1 and the attribute number will end
- * up with a value less than FirstLowInvalidHeapAttributeNumber.
- */
- if (hot_result && check_now == next_hot_attnum)
- {
- next_hot_attnum = bms_first_member(hot_attrs);
- next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
- }
- if (key_result && check_now == next_key_attnum)
- {
- next_key_attnum = bms_first_member(key_attrs);
- next_key_attnum += FirstLowInvalidHeapAttributeNumber;
- }
- if (id_result && check_now == next_id_attnum)
- {
- next_id_attnum = bms_first_member(id_attrs);
- next_id_attnum += FirstLowInvalidHeapAttributeNumber;
- }
+ if (!heap_tuple_attr_equals(RelationGetDescr(relation),
+ attnum, oldtup, newtup))
+ modified = bms_add_member(modified,
+ attnum - FirstLowInvalidHeapAttributeNumber);
}
- *satisfies_hot = hot_result;
- *satisfies_key = key_result;
- *satisfies_id = id_result;
+ return modified;
}
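
Condensing the scattered hunks above into one hedged sketch of the caller
side (backend context assumed; declarations, locking, and error paths are
omitted), the heap_update flow now looks roughly like this:

	Bitmapset  *hot_attrs = RelationGetIndexAttrBitmap(relation,
													   INDEX_ATTR_BITMAP_ALL);
	Bitmapset  *key_attrs = RelationGetIndexAttrBitmap(relation,
													   INDEX_ATTR_BITMAP_KEY);
	Bitmapset  *id_attrs = RelationGetIndexAttrBitmap(relation,
													   INDEX_ATTR_BITMAP_IDENTITY_KEY);
	Bitmapset  *interesting_attrs = NULL;
	Bitmapset  *modified_attrs;
	bool		hot_attrs_checked = false;

	/* only worth checking the HOT columns if the page still has free space */
	if (!PageIsFull(page))
	{
		interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
		hot_attrs_checked = true;
	}
	interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
	interesting_attrs = bms_add_members(interesting_attrs, id_attrs);

	modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
												  &oldtup, newtup);

	/* weaker tuple lock when no "key" column changed */
	if (!bms_overlap(modified_attrs, key_attrs))
		*lockmode = LockTupleNoKeyExclusive;

	/* HOT update only if the index columns were checked and none changed */
	if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
		use_hot_update = true;

	/* log the old replica-identity key only if an identity column changed */
	old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
										   bms_overlap(modified_attrs, id_attrs),
										   &old_key_copied);
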
/*
@@ -5745,6 +5721,17 @@ l4:
goto out_locked;
}
+ /*
+ * Also check Xmin: if this tuple was created by an aborted
+ * (sub)transaction, then we already locked the last live one in the
+ * chain, thus we're done, so return success.
+ */
+ if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
+ {
+ UnlockReleaseBuffer(buf);
+ return HeapTupleMayBeUpdated;
+ }
+
old_infomask = mytup.t_data->t_infomask;
old_infomask2 = mytup.t_data->t_infomask2;
xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
@@ -6047,7 +6034,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple)
XLogBeginInsert();
/* We want the same filtering on this as on a plain insert */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
@@ -6082,7 +6069,8 @@ heap_finish_speculative(Relation relation, HeapTuple tuple)
* could deadlock with each other, which would not be acceptable.
*
* This is somewhat redundant with heap_delete, but we prefer to have a
- * dedicated routine with stripped down requirements.
+ * dedicated routine with stripped down requirements. Note that this is also
+ * used to delete the TOAST tuples created during speculative insertion.
*
* This routine does not affect logical decoding as it only looks at
* confirmation records.
@@ -6126,7 +6114,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple)
*/
if (tp.t_data->t_choice.t_heap.t_xmin != xid)
elog(ERROR, "attempted to kill a tuple inserted by another transaction");
- if (!HeapTupleHeaderIsSpeculative(tp.t_data))
+ if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
elog(ERROR, "attempted to kill a non-speculative tuple");
Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
@@ -6196,7 +6184,10 @@ heap_abort_speculative(Relation relation, HeapTuple tuple)
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (HeapTupleHasExternal(&tp))
- toast_delete(relation, &tp);
+ {
+ Assert(!IsToastRelation(relation));
+ toast_delete(relation, &tp, true);
+ }
/*
* Never need to mark tuple for invalidation, since catalogs don't support
@@ -6770,8 +6761,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
* Note: it might seem we could make the changes without exclusive lock, since
* TransactionId read/write is assumed atomic anyway. However there is a race
* condition: someone who just fetched an old XID that we overwrite here could
- * conceivably not finish checking the XID against pg_clog before we finish
- * the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting
+ * conceivably not finish checking the XID against pg_xact before we finish
+ * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
* exclusive lock ensures no other backend is in process of checking the
* tuple status. Also, getting exclusive lock makes it safe to adjust the
* infomask bits.
@@ -7711,7 +7702,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
}
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info);
@@ -9139,3 +9130,80 @@ heap_sync(Relation rel)
heap_close(toastrel, AccessShareLock);
}
}
+
+/*
+ * Mask a heap page before performing consistency checks on it.
+ */
+void
+heap_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ OffsetNumber off;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ char *page_item;
+
+ page_item = (char *) (page + ItemIdGetOffset(iid));
+
+ if (ItemIdIsNormal(iid))
+ {
+ HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
+
+ /*
+ * If xmin of a tuple is not yet frozen, we should ignore
+ * differences in hint bits, since they can be set without
+ * emitting WAL.
+ */
+ if (!HeapTupleHeaderXminFrozen(page_htup))
+ page_htup->t_infomask &= ~HEAP_XACT_MASK;
+ else
+ {
+ /* We still need to mask the xmax hint bits. */
+ page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
+ page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
+ }
+
+ /*
+ * During replay, we set Command Id to FirstCommandId. Hence, mask
+ * it. See heap_xlog_insert() for details.
+ */
+ page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
+
+ /*
+ * For a speculative tuple, heap_insert() does not set ctid in the
+ * caller-passed heap tuple itself, leaving the ctid field to
+ * contain a speculative token value (a per-backend, monotonically
+ * increasing identifier). Moreover, it never WAL-logs ctid.
+ *
+ * During redo, heap_xlog_insert() sets t_ctid to the current block
+ * number and item offset, paying no attention to speculative
+ * insertions on the master. Hence, we also set t_ctid to the
+ * current block number and item offset here, so that any such
+ * difference is ignored.
+ */
+ if (HeapTupleHeaderIsSpeculative(page_htup))
+ ItemPointerSet(&page_htup->t_ctid, blkno, off);
+ }
+
+ /*
+ * Ignore any padding bytes after the tuple, when the length of the
+ * item is not MAXALIGNed.
+ */
+ if (ItemIdHasStorage(iid))
+ {
+ int len = ItemIdGetLength(iid);
+ int padlen = MAXALIGN(len) - len;
+
+ if (padlen > 0)
+ memset(page_item + len, MASK_MARKER, padlen);
+ }
+ }
+}
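
A hedged sketch of how a consistency checker would use heap_mask (backend
context assumed; the buffer names and surrounding plumbing are illustrative):
both the page image carried in WAL and the page produced by redo are masked
before comparison, so that hint bits and other legitimately unlogged
differences do not raise false alarms.

	char		primary_img[BLCKSZ];
	char		replay_img[BLCKSZ];

	/*
	 * ... primary_img filled from the full-page image in the WAL record,
	 * replay_img copied from the page as reconstructed by redo ...
	 */

	heap_mask(primary_img, blkno);
	heap_mask(replay_img, blkno);

	if (memcmp(primary_img, replay_img, BLCKSZ) != 0)
		elog(FATAL, "inconsistent page found, blkno %u", blkno);
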
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index c90fb71965..6529fe3d6b 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -3,7 +3,7 @@
* hio.c
* POSTGRES heap access method input/output code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 200861eef1..4f41511764 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -3,7 +3,7 @@
* pruneheap.c
* heap page pruning and HOT-chain management code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index f9ce9861e2..60dcb67a20 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -92,7 +92,7 @@
* heap's TOAST table will go through the normal bufmgr.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
@@ -119,6 +119,8 @@
#include "lib/ilist.h"
+#include "pgstat.h"
+
#include "replication/logical.h"
#include "replication/slot.h"
@@ -209,7 +211,7 @@ typedef struct RewriteMappingFile
} RewriteMappingFile;
/*
- * A single In-Memeory logical rewrite mapping, hanging of
+ * A single In-Memory logical rewrite mapping, hanging off
* RewriteMappingFile->mappings.
*/
typedef struct RewriteMappingDataEntry
@@ -258,9 +260,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
*/
rw_cxt = AllocSetContextCreate(CurrentMemoryContext,
"Table rewrite",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
old_cxt = MemoryContextSwitchTo(rw_cxt);
/* Create and fill in the state struct */
@@ -918,7 +918,8 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
* Note that we deviate from the usual WAL coding practices here,
* check the above "Logical rewrite support" comment for reasoning.
*/
- written = FileWrite(src->vfd, waldata_start, len);
+ written = FileWrite(src->vfd, waldata_start, len,
+ WAIT_EVENT_LOGICAL_REWRITE_WRITE);
if (written != len)
ereport(ERROR,
(errcode_for_file_access(),
@@ -959,7 +960,7 @@ logical_end_heap_rewrite(RewriteState state)
hash_seq_init(&seq_status, state->rs_logical_mappings);
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
{
- if (FileSync(src->vfd) != 0)
+ if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", src->path)));
@@ -1143,11 +1144,13 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
* Truncate all data that's not guaranteed to have been safely fsynced (by
* previous record or by the last checkpoint).
*/
+ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE);
if (ftruncate(fd, xlrec->offset) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not truncate file \"%s\" to %u: %m",
path, (uint32) xlrec->offset)));
+ pgstat_report_wait_end();
/* now seek to the position we want to write our data to */
if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset)
@@ -1161,20 +1164,24 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData);
/* write out tail end of mapping file (again) */
+ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE);
if (write(fd, data, len) != len)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", path)));
+ pgstat_report_wait_end();
/*
* Now fsync all previously written data. We could improve things and only
* do this for the last write to a file, but the required bookkeeping
* doesn't seem worth the trouble.
*/
+ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
if (pg_fsync(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", path)));
+ pgstat_report_wait_end();
CloseTransientFile(fd);
}
@@ -1196,7 +1203,7 @@ CheckPointLogicalRewriteHeap(void)
XLogRecPtr redo;
DIR *mappings_dir;
struct dirent *mapping_de;
- char path[MAXPGPATH];
+ char path[MAXPGPATH + 20];
/*
* We start of with a minimum of the last redo pointer. No new decoding
@@ -1227,7 +1234,7 @@ CheckPointLogicalRewriteHeap(void)
strcmp(mapping_de->d_name, "..") == 0)
continue;
- snprintf(path, MAXPGPATH, "pg_logical/mappings/%s", mapping_de->d_name);
+ snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
continue;
@@ -1268,10 +1275,12 @@ CheckPointLogicalRewriteHeap(void)
* changed or have only been created since the checkpoint's start,
* but it's currently not deemed worth the effort.
*/
- else if (pg_fsync(fd) != 0)
+ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
+ if (pg_fsync(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", path)));
+ pgstat_report_wait_end();
CloseTransientFile(fd);
}
}
diff --git a/src/backend/access/heap/syncscan.c b/src/backend/access/heap/syncscan.c
index a0f500edc8..20640cbbaf 100644
--- a/src/backend/access/heap/syncscan.c
+++ b/src/backend/access/heap/syncscan.c
@@ -36,7 +36,7 @@
* ss_report_location - update current scan location
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -48,6 +48,8 @@
#include "access/heapam.h"
#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
#include "utils/rel.h"
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index 452a9ecb68..b9963ab5ef 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -4,7 +4,7 @@
* Support routines for external and compressed storage of
* variable size attributes.
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
@@ -67,7 +67,7 @@ typedef struct toast_compress_header
#define TOAST_COMPRESS_SET_RAWSIZE(ptr, len) \
(((toast_compress_header *) (ptr))->rawsize = (len))
-static void toast_delete_datum(Relation rel, Datum value);
+static void toast_delete_datum(Relation rel, Datum value, bool is_speculative);
static Datum toast_save_datum(Relation rel, Datum value,
struct varlena * oldexternal, int options);
static bool toastrel_valueid_exists(Relation toastrel, Oid valueid);
@@ -461,7 +461,7 @@ toast_datum_size(Datum value)
* ----------
*/
void
-toast_delete(Relation rel, HeapTuple oldtup)
+toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
{
TupleDesc tupleDesc;
Form_pg_attribute *att;
@@ -508,7 +508,7 @@ toast_delete(Relation rel, HeapTuple oldtup)
if (toast_isnull[i])
continue;
else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value)))
- toast_delete_datum(rel, value);
+ toast_delete_datum(rel, value, is_speculative);
}
}
}
@@ -1068,7 +1068,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
if (need_delold)
for (i = 0; i < numAttrs; i++)
if (toast_delold[i])
- toast_delete_datum(rel, toast_oldvalues[i]);
+ toast_delete_datum(rel, toast_oldvalues[i], false);
return result_tuple;
}
@@ -1296,6 +1296,74 @@ toast_flatten_tuple_to_datum(HeapTupleHeader tup,
/* ----------
+ * toast_build_flattened_tuple -
+ *
+ * Build a tuple containing no out-of-line toasted fields.
+ * (This does not eliminate compressed or short-header datums.)
+ *
+ * This is essentially just like heap_form_tuple, except that it will
+ * expand any external-data pointers beforehand.
+ *
+ * It's not very clear whether it would be preferable to decompress
+ * in-line compressed datums while at it. For now, we don't.
+ * ----------
+ */
+HeapTuple
+toast_build_flattened_tuple(TupleDesc tupleDesc,
+ Datum *values,
+ bool *isnull)
+{
+ HeapTuple new_tuple;
+ Form_pg_attribute *att = tupleDesc->attrs;
+ int numAttrs = tupleDesc->natts;
+ int num_to_free;
+ int i;
+ Datum new_values[MaxTupleAttributeNumber];
+ Pointer freeable_values[MaxTupleAttributeNumber];
+
+ /*
+ * We can pass the caller's isnull array directly to heap_form_tuple, but
+ * we potentially need to modify the values array.
+ */
+ Assert(numAttrs <= MaxTupleAttributeNumber);
+ memcpy(new_values, values, numAttrs * sizeof(Datum));
+
+ num_to_free = 0;
+ for (i = 0; i < numAttrs; i++)
+ {
+ /*
+ * Look at non-null varlena attributes
+ */
+ if (!isnull[i] && att[i]->attlen == -1)
+ {
+ struct varlena *new_value;
+
+ new_value = (struct varlena *) DatumGetPointer(new_values[i]);
+ if (VARATT_IS_EXTERNAL(new_value))
+ {
+ new_value = heap_tuple_fetch_attr(new_value);
+ new_values[i] = PointerGetDatum(new_value);
+ freeable_values[num_to_free++] = (Pointer) new_value;
+ }
+ }
+ }
+
+ /*
+ * Form the reconfigured tuple.
+ */
+ new_tuple = heap_form_tuple(tupleDesc, new_values, isnull);
+
+ /*
+ * Free allocated temp values
+ */
+ for (i = 0; i < num_to_free; i++)
+ pfree(freeable_values[i]);
+
+ return new_tuple;
+}
+
+
+/* ----------
* toast_compress_datum -
*
* Create a compressed version of a varlena datum
@@ -1611,7 +1679,9 @@ toast_save_datum(Relation rel, Datum value,
* Create the index entry. We cheat a little here by not using
* FormIndexDatum: this relies on the knowledge that the index columns
* are the same as the initial columns of the table for all the
- * indexes.
+ * indexes. We also cheat by not providing an IndexInfo: this is okay
+ * for now because btree doesn't need one, but we might have to be
+ * more honest someday.
*
* Note also that there had better not be any user-created index on
* the TOAST table, since we don't bother to update anything else.
@@ -1624,7 +1694,8 @@ toast_save_datum(Relation rel, Datum value,
&(toasttup->t_self),
toastrel,
toastidxs[i]->rd_index->indisunique ?
- UNIQUE_CHECK_YES : UNIQUE_CHECK_NO);
+ UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
+ NULL);
}
/*
@@ -1663,7 +1734,7 @@ toast_save_datum(Relation rel, Datum value,
* ----------
*/
static void
-toast_delete_datum(Relation rel, Datum value)
+toast_delete_datum(Relation rel, Datum value, bool is_speculative)
{
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
struct varatt_external toast_pointer;
@@ -1714,7 +1785,10 @@ toast_delete_datum(Relation rel, Datum value)
/*
* Have a chunk, delete it
*/
- simple_heap_delete(toastrel, &toasttup->t_self);
+ if (is_speculative)
+ heap_abort_speculative(toastrel, toasttup);
+ else
+ simple_heap_delete(toastrel, &toasttup->t_self);
}
/*
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 3ad4a9f587..e5616ce051 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -3,7 +3,7 @@
* visibilitymap.c
* bitmap for tracking visibility of heap tuples
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -508,6 +508,9 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
+ /* NO EREPORT(ERROR) from here till changes are logged */
+ START_CRIT_SECTION();
+
/* Clear out the unwanted bytes. */
MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
@@ -523,7 +526,20 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
*/
map[truncByte] &= (1 << truncOffset) - 1;
+ /*
+ * Truncation of a relation is WAL-logged at a higher level, and we
+ * will be called again at WAL replay. But if checksums are enabled, we
+ * still need to write a WAL record to protect against a torn page, in
+ * case the page is flushed to disk before the truncation WAL record.
+ * We cannot use MarkBufferDirtyHint here, because that will not dirty
+ * the page during recovery.
+ */
MarkBufferDirty(mapBuffer);
+ if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ log_newpage_buffer(mapBuffer, false);
+
+ END_CRIT_SECTION();
+
UnlockReleaseBuffer(mapBuffer);
}
else
diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c
index 28f6cde896..7b597a072f 100644
--- a/src/backend/access/index/amapi.c
+++ b/src/backend/access/index/amapi.c
@@ -3,7 +3,7 @@
* amapi.c
* Support routines for API for Postgres index access methods.
*
- * Copyright (c) 2015-2016, PostgreSQL Global Development Group
+ * Copyright (c) 2015-2017, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
@@ -17,6 +17,7 @@
#include "access/htup_details.h"
#include "catalog/pg_am.h"
#include "catalog/pg_opclass.h"
+#include "utils/builtins.h"
#include "utils/syscache.h"
diff --git a/src/backend/access/index/amvalidate.c b/src/backend/access/index/amvalidate.c
index 1a3c5f16b9..80865e9ff9 100644
--- a/src/backend/access/index/amvalidate.c
+++ b/src/backend/access/index/amvalidate.c
@@ -3,7 +3,7 @@
* amvalidate.c
* Support routines for index access methods' amvalidate functions.
*
- * Copyright (c) 2016, PostgreSQL Global Development Group
+ * Copyright (c) 2016-2017, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 65c941d812..a91fda7bcd 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -3,7 +3,7 @@
* genam.c
* general index access method routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -119,6 +119,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
scan->xs_itup = NULL;
scan->xs_itupdesc = NULL;
+ scan->xs_hitup = NULL;
+ scan->xs_hitupdesc = NULL;
ItemPointerSetInvalid(&scan->xs_ctup.t_self);
scan->xs_ctup.t_data = NULL;
@@ -166,6 +168,10 @@ IndexScanEnd(IndexScanDesc scan)
* The passed-in values/nulls arrays are the "raw" input to the index AM,
* e.g. results of FormIndexDatum --- this is not necessarily what is stored
* in the index, but it's what the user perceives to be stored.
+ *
+ * Note: if you change anything here, check whether
+ * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar
+ * change.
*/
char *
BuildIndexValueDescription(Relation indexRelation,
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 54b71cb2f7..cc5ac8b857 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -3,7 +3,7 @@
* indexam.c
* general index access method routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -20,6 +20,10 @@
* index_insert - insert an index tuple into a relation
* index_markpos - mark a scan position
* index_restrpos - restore a scan position
+ * index_parallelscan_estimate - estimate shared memory for parallel scan
+ * index_parallelscan_initialize - initialize parallel scan
+ * index_parallelrescan - (re)start a parallel scan of an index
+ * index_beginscan_parallel - join parallel index scan
* index_getnext_tid - get the next TID from a scan
* index_fetch_heap - get the scan's next heap tuple
* index_getnext - get the next heap tuple from a scan
@@ -120,7 +124,8 @@ do { \
} while(0)
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
- int nkeys, int norderbys, Snapshot snapshot);
+ int nkeys, int norderbys, Snapshot snapshot,
+ ParallelIndexScanDesc pscan, bool temp_snap);
/* ----------------------------------------------------------------
@@ -191,7 +196,8 @@ index_insert(Relation indexRelation,
bool *isnull,
ItemPointer heap_t_ctid,
Relation heapRelation,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
RELATION_CHECKS;
CHECK_REL_PROCEDURE(aminsert);
@@ -203,7 +209,7 @@ index_insert(Relation indexRelation,
return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull,
heap_t_ctid, heapRelation,
- checkUnique);
+ checkUnique, indexInfo);
}
/*
@@ -219,7 +225,7 @@ index_beginscan(Relation heapRelation,
{
IndexScanDesc scan;
- scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot);
+ scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -244,7 +250,7 @@ index_beginscan_bitmap(Relation indexRelation,
{
IndexScanDesc scan;
- scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot);
+ scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -260,8 +266,11 @@ index_beginscan_bitmap(Relation indexRelation,
*/
static IndexScanDesc
index_beginscan_internal(Relation indexRelation,
- int nkeys, int norderbys, Snapshot snapshot)
+ int nkeys, int norderbys, Snapshot snapshot,
+ ParallelIndexScanDesc pscan, bool temp_snap)
{
+ IndexScanDesc scan;
+
RELATION_CHECKS;
CHECK_REL_PROCEDURE(ambeginscan);
@@ -276,8 +285,13 @@ index_beginscan_internal(Relation indexRelation,
/*
* Tell the AM to open a scan.
*/
- return indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys,
+ scan = indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys,
norderbys);
+ /* Initialize information for parallel scan. */
+ scan->parallel_scan = pscan;
+ scan->xs_temp_snap = temp_snap;
+
+ return scan;
}
/* ----------------
@@ -341,6 +355,9 @@ index_endscan(IndexScanDesc scan)
/* Release index refcount acquired by index_beginscan */
RelationDecrementReferenceCount(scan->indexRelation);
+ if (scan->xs_temp_snap)
+ UnregisterSnapshot(scan->xs_snapshot);
+
/* Release the scan data structure itself */
IndexScanEnd(scan);
}
@@ -389,6 +406,115 @@ index_restrpos(IndexScanDesc scan)
scan->indexRelation->rd_amroutine->amrestrpos(scan);
}
+/*
+ * index_parallelscan_estimate - estimate shared memory for parallel scan
+ *
+ * Currently, we don't pass any information to the AM-specific estimator,
+ * so it can probably only return a constant. In the future, we might need
+ * to pass more information.
+ */
+Size
+index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+{
+ Size nbytes;
+
+ RELATION_CHECKS;
+
+ nbytes = offsetof(ParallelIndexScanDescData, ps_snapshot_data);
+ nbytes = add_size(nbytes, EstimateSnapshotSpace(snapshot));
+ nbytes = MAXALIGN(nbytes);
+
+ /*
+ * If amestimateparallelscan is not provided, assume there is no
+ * AM-specific data needed. (It's hard to believe that could work, but
+ * it's easy enough to cater to it here.)
+ */
+ if (indexRelation->rd_amroutine->amestimateparallelscan != NULL)
+ nbytes = add_size(nbytes,
+ indexRelation->rd_amroutine->amestimateparallelscan());
+
+ return nbytes;
+}
+
+/*
+ * index_parallelscan_initialize - initialize parallel scan
+ *
+ * We initialize both the ParallelIndexScanDesc proper and the AM-specific
+ * information which follows it.
+ *
+ * This function calls the access-method-specific initialization routine to
+ * initialize AM-specific information. Call this just once in the leader
+ * process; then, individual workers attach via index_beginscan_parallel.
+ */
+void
+index_parallelscan_initialize(Relation heapRelation, Relation indexRelation,
+ Snapshot snapshot, ParallelIndexScanDesc target)
+{
+ Size offset;
+
+ RELATION_CHECKS;
+
+ offset = add_size(offsetof(ParallelIndexScanDescData, ps_snapshot_data),
+ EstimateSnapshotSpace(snapshot));
+ offset = MAXALIGN(offset);
+
+ target->ps_relid = RelationGetRelid(heapRelation);
+ target->ps_indexid = RelationGetRelid(indexRelation);
+ target->ps_offset = offset;
+ SerializeSnapshot(snapshot, target->ps_snapshot_data);
+
+ /* aminitparallelscan is optional; assume no-op if not provided by AM */
+ if (indexRelation->rd_amroutine->aminitparallelscan != NULL)
+ {
+ void *amtarget;
+
+ amtarget = OffsetToPointer(target, offset);
+ indexRelation->rd_amroutine->aminitparallelscan(amtarget);
+ }
+}
+
+/* ----------------
+ * index_parallelrescan - (re)start a parallel scan of an index
+ * ----------------
+ */
+void
+index_parallelrescan(IndexScanDesc scan)
+{
+ SCAN_CHECKS;
+
+ /* amparallelrescan is optional; assume no-op if not provided by AM */
+ if (scan->indexRelation->rd_amroutine->amparallelrescan != NULL)
+ scan->indexRelation->rd_amroutine->amparallelrescan(scan);
+}
+
+/*
+ * index_beginscan_parallel - join parallel index scan
+ *
+ * Caller must be holding suitable locks on the heap and the index.
+ */
+IndexScanDesc
+index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
+ int norderbys, ParallelIndexScanDesc pscan)
+{
+ Snapshot snapshot;
+ IndexScanDesc scan;
+
+ Assert(RelationGetRelid(heaprel) == pscan->ps_relid);
+ snapshot = RestoreSnapshot(pscan->ps_snapshot_data);
+ RegisterSnapshot(snapshot);
+ scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot,
+ pscan, true);
+
+ /*
+ * Save additional parameters into the scandesc. Everything else was set
+ * up by index_beginscan_internal.
+ */
+ scan->heapRelation = heaprel;
+ scan->xs_snapshot = snapshot;
+
+ return scan;
+}
+
/* ----------------
* index_getnext_tid - get the next TID from a scan
*
@@ -409,8 +535,8 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
/*
* The AM's amgettuple proc finds the next index entry matching the scan
* keys, and puts the TID into scan->xs_ctup.t_self. It should also set
- * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
- * to those fields here.
+ * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we
+ * pay no attention to those fields here.
*/
found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction);
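
As a minimal sketch of how a caller might drive the parallel index-scan API
added to indexam.c above: the leader sizes and fills the shared descriptor
once, and each participant (leader or worker) attaches to it with
index_beginscan_parallel. The shared-memory plumbing and the helper names
(leader_setup_parallel_scan, participant_scan, shared_chunk) are assumptions
for illustration only, not part of this patch.

#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"

/*
 * Leader: compute the required size, then initialize the shared
 * ParallelIndexScanDesc exactly once.  "shared_chunk" is assumed to point to
 * at least index_parallelscan_estimate() bytes of memory addressable by all
 * participants; how it is allocated is not shown here.
 */
static ParallelIndexScanDesc
leader_setup_parallel_scan(Relation heaprel, Relation indexrel,
						   Snapshot snapshot, void *shared_chunk)
{
	ParallelIndexScanDesc pscan = (ParallelIndexScanDesc) shared_chunk;

	/* In real code this size is obtained before allocating shared_chunk. */
	(void) index_parallelscan_estimate(indexrel, snapshot);

	index_parallelscan_initialize(heaprel, indexrel, snapshot, pscan);

	return pscan;
}

/*
 * Any participant: join the parallel scan and walk the matching TIDs.  The
 * snapshot is restored from the shared descriptor, so it is not passed in.
 */
static void
participant_scan(Relation heaprel, Relation indexrel,
				 ParallelIndexScanDesc pscan)
{
	IndexScanDesc scan;
	ItemPointer tid;

	scan = index_beginscan_parallel(heaprel, indexrel, 0, 0, pscan);
	index_rescan(scan, NULL, 0, NULL, 0);

	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
	{
		/* fetch or otherwise process the heap tuple identified by tid */
	}

	index_endscan(scan);
}
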
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 067d15c803..a3f11da8d5 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -521,11 +521,12 @@ because it allows running applications to continue while the standby
changes state into a normally running server.
The interlocking required to avoid returning incorrect results from
-MVCC scans is not required on standby nodes. That is because
+non-MVCC scans is not required on standby nodes. That is because
HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only
ever used during write transactions, which cannot exist on the standby.
-This leaves HeapTupleSatisfiesMVCC() and HeapTupleSatisfiesToast().
+MVCC scans are already protected by definition, so HeapTupleSatisfiesMVCC()
+is not a problem. That leaves concern only for HeapTupleSatisfiesToast().
HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's
because it doesn't need to - if the main heap row is visible then the
toast rows will also be visible. So as long as we follow a toast
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 0d60da61cc..4b131efb87 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -3,7 +3,7 @@
* nbtcompare.c
* Comparison functions for btree access method.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index ef69290b6c..6dca8109fd 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -3,7 +3,7 @@
* nbtinsert.c
* Item insertion in Lehman and Yao btrees for Postgres.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -17,6 +17,7 @@
#include "access/heapam.h"
#include "access/nbtree.h"
+#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 2001dc14fb..f815fd40b2 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -4,7 +4,7 @@
* BTree-specific page management code for the Postgres btree access
* method.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -23,6 +23,7 @@
#include "postgres.h"
#include "access/nbtree.h"
+#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4668c5ee59..116f5f32f6 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -8,7 +8,7 @@
* This file contains only the public interface routines.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -23,11 +23,14 @@
#include "access/xlog.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
+#include "pgstat.h"
+#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h" /* pgrminclude ignore */
+#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
@@ -62,6 +65,45 @@ typedef struct
MemoryContext pagedelcontext;
} BTVacState;
+/*
+ * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
+ *
+ * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
+ * a new page; others must wait.
+ *
+ * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
+ * to a new page; some process can start doing that.
+ *
+ * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
+ * We reach this state once for every distinct combination of array keys.
+ */
+typedef enum
+{
+ BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_ADVANCING,
+ BTPARALLEL_IDLE,
+ BTPARALLEL_DONE
+} BTPS_State;
+
+/*
+ * BTParallelScanDescData contains btree specific shared information required
+ * for parallel scan.
+ */
+typedef struct BTParallelScanDescData
+{
+ BlockNumber btps_scanPage; /* latest or next page to be scanned */
+ BTPS_State btps_pageStatus; /* indicates whether next page is available
+ * for scan. See above for possible states of
+ * parallel scan. */
+ int btps_arrayKeyCount; /* count indicating number of array
+ * scan keys processed by parallel
+ * scan */
+ slock_t btps_mutex; /* protects above variables */
+ ConditionVariable btps_cv; /* used to synchronize parallel scan */
+} BTParallelScanDescData;
+
+typedef struct BTParallelScanDescData *BTParallelScanDesc;
+
static void btbuildCallback(Relation index,
HeapTuple htup,
@@ -98,6 +140,7 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->amstorage = false;
amroutine->amclusterable = true;
amroutine->ampredlocks = true;
+ amroutine->amcanparallel = true;
amroutine->amkeytype = InvalidOid;
amroutine->ambuild = btbuild;
@@ -117,6 +160,9 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->amendscan = btendscan;
amroutine->ammarkpos = btmarkpos;
amroutine->amrestrpos = btrestrpos;
+ amroutine->amestimateparallelscan = btestimateparallelscan;
+ amroutine->aminitparallelscan = btinitparallelscan;
+ amroutine->amparallelrescan = btparallelrescan;
PG_RETURN_POINTER(amroutine);
}
@@ -242,13 +288,18 @@ btbuildempty(Relation index)
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, P_NONE, 0);
- /* Write the page. If archiving/streaming, XLOG it. */
+ /*
+ * Write the page and log it. It might seem that an immediate sync would
+ * be sufficient to guarantee that the file exists on disk, but recovery
+ * itself might remove it while replaying, for example, an
+ * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
+ * this even when wal_level=minimal.
+ */
PageSetChecksumInplace(metapage, BTREE_METAPAGE);
smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
(char *) metapage, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- BTREE_METAPAGE, metapage, false);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BTREE_METAPAGE, metapage, false);
/*
* An immediate sync is required even if we xlog'd the page, because the
@@ -267,7 +318,8 @@ btbuildempty(Relation index)
bool
btinsert(Relation rel, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
- IndexUniqueCheck checkUnique)
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
{
bool result;
IndexTuple itup;
@@ -481,6 +533,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}
so->markItemIndex = -1;
+ so->arrayKeyCount = 0;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
@@ -643,6 +696,217 @@ btrestrpos(IndexScanDesc scan)
}
/*
+ * btestimateparallelscan -- estimate storage for BTParallelScanDescData
+ */
+Size
+btestimateparallelscan(void)
+{
+ return sizeof(BTParallelScanDescData);
+}
+
+/*
+ * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
+ */
+void
+btinitparallelscan(void *target)
+{
+ BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
+
+ SpinLockInit(&bt_target->btps_mutex);
+ bt_target->btps_scanPage = InvalidBlockNumber;
+ bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ bt_target->btps_arrayKeyCount = 0;
+ ConditionVariableInit(&bt_target->btps_cv);
+}
+
+/*
+ * btparallelrescan() -- reset parallel scan
+ */
+void
+btparallelrescan(IndexScanDesc scan)
+{
+ BTParallelScanDesc btscan;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+
+ Assert(parallel_scan);
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * In theory, we don't need to acquire the spinlock here, because there
+ * shouldn't be any other workers running at this point, but we do so for
+ * consistency.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount = 0;
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
+ * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
+ * page. Other scans must wait until we call _bt_parallel_release() or
+ * _bt_parallel_done().
+ *
+ * The return value is true if we successfully seized the scan and false
+ * if we did not. The latter case occurs if no pages remain for the current
+ * set of scankeys.
+ *
+ * If the return value is true, *pageno returns the next or current page
+ * of the scan (depending on the scan direction). An invalid block number
+ * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * The first time a participating process reaches the last page, it will return
+ * true and set *pageno to P_NONE; after that, further attempts to seize the
+ * scan will return false.
+ *
+ * Callers should ignore the value of pageno if the return value is false.
+ */
+bool
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTPS_State pageStatus;
+ bool exit_loop = false;
+ bool status = true;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ *pageno = P_NONE;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ while (1)
+ {
+ SpinLockAcquire(&btscan->btps_mutex);
+ pageStatus = btscan->btps_pageStatus;
+
+ if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ {
+ /* Parallel scan has already advanced to a new set of scankeys. */
+ status = false;
+ }
+ else if (pageStatus == BTPARALLEL_DONE)
+ {
+ /*
+ * We're done with this set of scankeys. This may be the end, or
+ * there could be more sets to try.
+ */
+ status = false;
+ }
+ else if (pageStatus != BTPARALLEL_ADVANCING)
+ {
+ /*
+ * We have successfully seized control of the scan for the purpose
+ * of advancing it to a new page!
+ */
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ *pageno = btscan->btps_scanPage;
+ exit_loop = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+ if (exit_loop || !status)
+ break;
+ ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
+ }
+ ConditionVariableCancelSleep();
+
+ return status;
+}
+
+/*
+ * _bt_parallel_release() -- Complete the process of advancing the scan to a
+ * new page. We now have the new value of btps_scanPage; some other backend
+ * can now begin advancing the scan.
+ */
+void
+_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+{
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = scan_page;
+ btscan->btps_pageStatus = BTPARALLEL_IDLE;
+ SpinLockRelease(&btscan->btps_mutex);
+ ConditionVariableSignal(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_done() -- Mark the parallel scan as complete.
+ *
+ * When there are no pages left to scan, this function should be called to
+ * notify other workers. Otherwise, they might wait forever for the scan to
+ * advance to the next page.
+ */
+void
+_bt_parallel_done(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+ bool status_changed = false;
+
+ /* Do nothing for non-parallel scans */
+ if (parallel_scan == NULL)
+ return;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * Mark the parallel scan as done for this combination of scan keys,
+ * unless some other process already did so. See also
+ * _bt_advance_array_keys.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
+ btscan->btps_pageStatus != BTPARALLEL_DONE)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_DONE;
+ status_changed = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+
+ /* wake up all the workers associated with this parallel scan */
+ if (status_changed)
+ ConditionVariableBroadcast(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
+ * keys.
+ *
+ * Updates the count of array keys processed for both local and parallel
+ * scans.
+ */
+void
+_bt_parallel_advance_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ so->arrayKeyCount++;
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ {
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount++;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
* Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
@@ -763,9 +1027,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
"_bt_pagedel",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ ALLOCSET_DEFAULT_SIZES);
/*
* The outer loop iterates over all index pages except the metapage, in
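
The state machine above boils down to a simple per-page protocol that each
participant repeats: seize the scan, process the page it was handed, publish
the next page, and broadcast completion once the end is reached. A minimal
sketch of one round of that loop follows; advance_one_page is a hypothetical
illustration (the real drivers are _bt_first and _bt_readnextpage in
nbtsearch.c below), and the page-reading step is elided.

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"

/*
 * Hypothetical sketch of one round of the parallel-scan protocol for a
 * forward scan.  Returns false once no pages remain for the current set of
 * array keys.
 */
static bool
advance_one_page(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno;

	/* Wait until we own the right to advance, or learn the scan is over. */
	if (!_bt_parallel_seize(scan, &blkno))
		return false;			/* another participant finished this key set */

	if (blkno == P_NONE)
	{
		/* We are the one who noticed the end; wake everybody up. */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * Here the real code either descends to the first leaf page (when blkno
	 * is InvalidBlockNumber, i.e. the scan has not started) or reads page
	 * blkno, and then takes that page's right-link (opaque->btpo_next) as
	 * the next page to publish.  P_NONE stands in for that value in this
	 * sketch, so the next participant to seize the scan will mark it done.
	 */
	next_blkno = P_NONE;

	/* Publish the next page so another participant can move on. */
	_bt_parallel_release(scan, next_blkno);

	return true;
}
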
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index ee46023c5a..2f32b2e78d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -4,7 +4,7 @@
* Search code for postgres btrees.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -30,9 +30,13 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
+static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
+ ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
/*
@@ -544,8 +548,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKeyData notnullkeys[INDEX_MAX_KEYS];
int keysCount = 0;
int i;
+ bool status = true;
StrategyNumber strat_total;
BTScanPosItem *currItem;
+ BlockNumber blkno;
Assert(!BTScanPosIsValid(so->currPos));
@@ -564,6 +570,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (!so->qual_ok)
return false;
+ /*
+ * For parallel scans, get the starting page from shared state. If the
+ * scan has not started, proceed to find the first leaf page in the usual
+ * way while keeping other participating processes waiting. If the scan
+ * has already begun, use the page number from the shared structure.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ return false;
+ else if (blkno == P_NONE)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ else if (blkno != InvalidBlockNumber)
+ {
+ if (!_bt_parallel_readpage(scan, blkno, dir))
+ return false;
+ goto readcomplete;
+ }
+ }
+
/*----------
* Examine the scan keys to discover where we need to start the scan.
*
@@ -743,7 +773,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* there.
*/
if (keysCount == 0)
- return _bt_endpoint(scan, dir);
+ {
+ bool match;
+
+ match = _bt_endpoint(scan, dir);
+
+ if (!match)
+ {
+ /* No match, so mark (parallel) scan finished */
+ _bt_parallel_done(scan);
+ }
+
+ return match;
+ }
/*
* We want to start the scan somewhere within the index. Set up an
@@ -773,7 +815,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
Assert(subkey->sk_flags & SK_ROW_MEMBER);
if (subkey->sk_flags & SK_ISNULL)
+ {
+ _bt_parallel_done(scan);
return false;
+ }
memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
/*
@@ -993,25 +1038,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* because nothing finer to lock exists.
*/
PredicateLockRelation(rel, scan->xs_snapshot);
+
+ /*
+ * mark parallel scan as done, so that all the workers can finish
+ * their scan
+ */
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+
return false;
}
else
PredicateLockPage(rel, BufferGetBlockNumber(buf),
scan->xs_snapshot);
- /* initialize moreLeft/moreRight appropriately for scan direction */
- if (ScanDirectionIsForward(dir))
- {
- so->currPos.moreLeft = false;
- so->currPos.moreRight = true;
- }
- else
- {
- so->currPos.moreLeft = true;
- so->currPos.moreRight = false;
- }
- so->numKilled = 0; /* just paranoia */
- Assert(so->markItemIndex == -1);
+ _bt_initialize_more_data(so, dir);
/* position to the precise item on the page */
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
@@ -1060,6 +1101,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
}
+readcomplete:
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
@@ -1132,6 +1174,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
* that there can be no more matching tuples in the current scan direction.
*
+ * In the case of a parallel scan, caller must have called _bt_parallel_seize
+ * prior to calling this function; this function will invoke
+ * _bt_parallel_release before returning.
+ *
* Returns true if any matching items found on the page, false if none.
*/
static bool
@@ -1154,6 +1200,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
page = BufferGetPage(so->currPos.buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* allow the next page to be processed by a parallel worker */
+ if (scan->parallel_scan)
+ {
+ if (ScanDirectionIsForward(dir))
+ _bt_parallel_release(scan, opaque->btpo_next);
+ else
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
@@ -1278,21 +1334,16 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
* if pinned, we'll drop the pin before moving to next page. The buffer is
* not locked on entry.
*
- * On success exit, so->currPos is updated to contain data from the next
- * interesting page. For success on a scan using a non-MVCC snapshot we hold
- * a pin, but not a read lock, on that page. If we do not hold the pin, we
- * set so->currPos.buf to InvalidBuffer. We return TRUE to indicate success.
- *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
+ * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
+ * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
+ * to InvalidBuffer. We return TRUE to indicate success.
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- Relation rel;
- Page page;
- BTPageOpaque opaque;
+ BlockNumber blkno = InvalidBlockNumber;
+ bool status = true;
Assert(BTScanPosIsValid(so->currPos));
@@ -1319,25 +1370,103 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
so->markItemIndex = -1;
}
- rel = scan->indexRelation;
-
if (ScanDirectionIsForward(dir))
{
/* Walk right to the next page with data */
- /* We must rely on the previously saved nextPage link! */
- BlockNumber blkno = so->currPos.nextPage;
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the next block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so use the previously-saved nextPage link. */
+ blkno = so->currPos.nextPage;
+ }
/* Remember we left a page with data */
so->currPos.moreLeft = true;
/* release the previous buffer, if pinned */
BTScanPosUnpinIfPinned(so->currPos);
+ }
+ else
+ {
+ /* Remember we left a page with data */
+ so->currPos.moreRight = true;
+
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the current block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ BTScanPosUnpinIfPinned(so->currPos);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so just use our own notion of the current page */
+ blkno = so->currPos.currPage;
+ }
+ }
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+/*
+ * _bt_readnextpage() -- Read next page containing valid data for scan
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page. On success, the caller is responsible for releasing the
+ * lock and pin on the buffer. We return TRUE to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
+ */
+static bool
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel;
+ Page page;
+ BTPageOpaque opaque;
+ bool status = true;
+
+ rel = scan->indexRelation;
+
+ if (ScanDirectionIsForward(dir))
+ {
for (;;)
{
- /* if we're at end of scan, give up */
+ /*
+ * if we're at end of scan, give up and mark parallel scan as
+ * done, so that all the workers can finish their scan
+ */
if (blkno == P_NONE || !so->currPos.moreRight)
{
+ _bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
@@ -1345,10 +1474,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
CHECK_FOR_INTERRUPTS();
/* step right one page */
so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
- /* check for deleted page */
page = BufferGetPage(so->currPos.buf);
TestForOldSnapshot(scan->xs_snapshot, rel, page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* check for deleted page */
if (!P_IGNORE(opaque))
{
PredicateLockPage(rel, blkno, scan->xs_snapshot);
@@ -1359,14 +1488,32 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
}
/* nope, keep going */
- blkno = opaque->btpo_next;
+ if (scan->parallel_scan != NULL)
+ {
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ blkno = opaque->btpo_next;
_bt_relbuf(rel, so->currPos.buf);
}
}
else
{
- /* Remember we left a page with data */
- so->currPos.moreRight = true;
+ /*
+ * Should only happen in parallel cases, when some other backend
+ * advanced the scan.
+ */
+ if (so->currPos.currPage != blkno)
+ {
+ BTScanPosUnpinIfPinned(so->currPos);
+ so->currPos.currPage = blkno;
+ }
/*
* Walk left to the next page with data. This is much more complex
@@ -1401,6 +1548,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
if (!so->currPos.moreLeft)
{
_bt_relbuf(rel, so->currPos.buf);
+ _bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
@@ -1412,6 +1560,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
/* if we're physically at end of index, return failure */
if (so->currPos.buf == InvalidBuffer)
{
+ _bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
@@ -1432,9 +1581,46 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
break;
}
+
+ /*
+ * For parallel scans, get the last page scanned as it is quite
+ * possible that by the time we try to seize the scan, some other
+ * worker has already advanced the scan to a different page. We
+ * must continue based on the latest page scanned by any worker.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ }
}
}
+ return true;
+}
+
+/*
+ * _bt_parallel_readpage() -- Read current page containing valid data for scan
+ *
+ * On success, release lock and maybe pin on buffer. We return TRUE to
+ * indicate success.
+ */
+static bool
+_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ _bt_initialize_more_data(so, dir);
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
/* Drop the lock, and maybe the pin, on the current page */
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
@@ -1712,19 +1898,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
/* remember which buffer we have pinned */
so->currPos.buf = buf;
- /* initialize moreLeft/moreRight appropriately for scan direction */
- if (ScanDirectionIsForward(dir))
- {
- so->currPos.moreLeft = false;
- so->currPos.moreRight = true;
- }
- else
- {
- so->currPos.moreLeft = true;
- so->currPos.moreRight = false;
- }
- so->numKilled = 0; /* just paranoia */
- so->markItemIndex = -1; /* ditto */
+ _bt_initialize_more_data(so, dir);
/*
* Now load data from the first page of the scan.
@@ -1753,3 +1927,25 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
return true;
}
+
+/*
+ * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
+ * for scan direction
+ */
+static inline void
+_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
+{
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 99a014e8f4..3d041c47c0 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -55,7 +55,7 @@
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -680,9 +680,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
bool merge = (btspool2 != NULL);
IndexTuple itup,
itup2 = NULL;
- bool should_free,
- should_free2,
- load1;
+ bool load1;
TupleDesc tupdes = RelationGetDescr(wstate->index);
int i,
keysz = RelationGetNumberOfAttributes(wstate->index);
@@ -697,10 +695,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
*/
/* the preparation of merge */
- itup = tuplesort_getindextuple(btspool->sortstate,
- true, &should_free);
- itup2 = tuplesort_getindextuple(btspool2->sortstate,
- true, &should_free2);
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
indexScanKey = _bt_mkscankey_nodata(wstate->index);
/* Prepare SortSupport data for each column */
@@ -775,18 +771,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (load1)
{
_bt_buildadd(wstate, state, itup);
- if (should_free)
- pfree(itup);
- itup = tuplesort_getindextuple(btspool->sortstate,
- true, &should_free);
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
}
else
{
_bt_buildadd(wstate, state, itup2);
- if (should_free2)
- pfree(itup2);
- itup2 = tuplesort_getindextuple(btspool2->sortstate,
- true, &should_free2);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
}
}
pfree(sortKeys);
@@ -795,15 +785,13 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
/* merge is unnecessary */
while ((itup = tuplesort_getindextuple(btspool->sortstate,
- true, &should_free)) != NULL)
+ true)) != NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
state = _bt_pagestate(wstate, 0);
_bt_buildadd(wstate, state, itup);
- if (should_free)
- pfree(itup);
}
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 5d335c7f97..5b259a31d9 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3,7 +3,7 @@
* nbtutils.c
* Utility code for Postgres btree implementation.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -232,10 +232,8 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
*/
if (so->arrayContext == NULL)
so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
- "BTree Array Context",
- ALLOCSET_SMALL_MINSIZE,
- ALLOCSET_SMALL_INITSIZE,
- ALLOCSET_SMALL_MAXSIZE);
+ "BTree array context",
+ ALLOCSET_SMALL_SIZES);
else
MemoryContextReset(so->arrayContext);
@@ -592,6 +590,10 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
break;
}
+ /* advance parallel scan */
+ if (scan->parallel_scan != NULL)
+ _bt_parallel_advance_array_keys(scan);
+
return found;
}
diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c
index 7d0bdabc1d..88e33f54cd 100644
--- a/src/backend/access/nbtree/nbtvalidate.c
+++ b/src/backend/access/nbtree/nbtvalidate.c
@@ -3,7 +3,7 @@
* nbtvalidate.c
* Opclass validator for btree.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -22,6 +22,7 @@
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
+#include "utils/regproc.h"
#include "utils/syscache.h"
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index c536e22432..ac60db0d49 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -4,7 +4,7 @@
* WAL replay logic for btrees.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -14,8 +14,10 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam_xlog.h"
#include "access/nbtree.h"
+#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
@@ -1028,3 +1030,52 @@ btree_redo(XLogReaderState *record)
elog(PANIC, "btree_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a btree page before performing consistency checks on it.
+ */
+void
+btree_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ BTPageOpaque maskopaq;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISDELETED(maskopaq))
+ {
+ /*
+ * Mask page content on a DELETED page since it will be re-initialized
+ * during replay. See btree_xlog_unlink_page() for details.
+ */
+ mask_page_content(page);
+ }
+ else if (P_ISLEAF(maskopaq))
+ {
+ /*
+ * In btree leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ /*
+ * During replay of a btree page split, we don't set the BTP_SPLIT_END
+ * flag of the right sibling and initialize the cycle_id to 0 for the same
+ * page. See btree_xlog_split() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_SPLIT_END;
+ maskopaq->btpo_cycleid = 0;
+}
diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c
index 433526f5ec..637ebf30f8 100644
--- a/src/backend/access/rmgrdesc/brindesc.c
+++ b/src/backend/access/rmgrdesc/brindesc.c
@@ -3,7 +3,7 @@
* brindesc.c
* rmgr descriptor routines for BRIN indexes
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -61,6 +61,13 @@ brin_desc(StringInfo buf, XLogReaderState *record)
appendStringInfo(buf, "targetBlk %u", xlrec->targetBlk);
}
+ else if (info == XLOG_BRIN_DESUMMARIZE)
+ {
+ xl_brin_desummarize *xlrec = (xl_brin_desummarize *) rec;
+
+ appendStringInfo(buf, "pagesPerRange %u, heapBlk %u, page offset %u",
+ xlrec->pagesPerRange, xlrec->heapBlk, xlrec->regOffset);
+ }
}
const char *
@@ -91,6 +98,9 @@ brin_identify(uint8 info)
case XLOG_BRIN_REVMAP_EXTEND:
id = "REVMAP_EXTEND";
break;
+ case XLOG_BRIN_DESUMMARIZE:
+ id = "DESUMMARIZE";
+ break;
}
return id;
diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c
index 41ea254710..9181154ffd 100644
--- a/src/backend/access/rmgrdesc/clogdesc.c
+++ b/src/backend/access/rmgrdesc/clogdesc.c
@@ -3,7 +3,7 @@
* clogdesc.c
* rmgr descriptor routines for access/transam/clog.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -23,12 +23,20 @@ clog_desc(StringInfo buf, XLogReaderState *record)
char *rec = XLogRecGetData(record);
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
- if (info == CLOG_ZEROPAGE || info == CLOG_TRUNCATE)
+ if (info == CLOG_ZEROPAGE)
{
int pageno;
memcpy(&pageno, rec, sizeof(int));
- appendStringInfo(buf, "%d", pageno);
+ appendStringInfo(buf, "page %d", pageno);
+ }
+ else if (info == CLOG_TRUNCATE)
+ {
+ xl_clog_truncate xlrec;
+
+ memcpy(&xlrec, rec, sizeof(xl_clog_truncate));
+ appendStringInfo(buf, "page %d; oldestXact %u",
+ xlrec.pageno, xlrec.oldestXact);
}
}
diff --git a/src/backend/access/rmgrdesc/committsdesc.c b/src/backend/access/rmgrdesc/committsdesc.c
index 527e5dc724..3e670bd543 100644
--- a/src/backend/access/rmgrdesc/committsdesc.c
+++ b/src/backend/access/rmgrdesc/committsdesc.c
@@ -3,7 +3,7 @@
* committsdesc.c
* rmgr descriptor routines for access/transam/commit_ts.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -33,10 +33,10 @@ commit_ts_desc(StringInfo buf, XLogReaderState *record)
}
else if (info == COMMIT_TS_TRUNCATE)
{
- int pageno;
+ xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) rec;
- memcpy(&pageno, rec, sizeof(int));
- appendStringInfo(buf, "%d", pageno);
+ appendStringInfo(buf, "pageno %d, oldestXid %u",
+ trunc->pageno, trunc->oldestXid);
}
else if (info == COMMIT_TS_SETTS)
{
diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c
index 83720ce765..768242cfd5 100644
--- a/src/backend/access/rmgrdesc/dbasedesc.c
+++ b/src/backend/access/rmgrdesc/dbasedesc.c
@@ -3,7 +3,7 @@
* dbasedesc.c
* rmgr descriptor routines for commands/dbcommands.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/rmgrdesc/genericdesc.c b/src/backend/access/rmgrdesc/genericdesc.c
index 22f81570a5..c4705428f1 100644
--- a/src/backend/access/rmgrdesc/genericdesc.c
+++ b/src/backend/access/rmgrdesc/genericdesc.c
@@ -4,7 +4,7 @@
* rmgr descriptor routines for access/transam/generic_xlog.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/rmgrdesc/genericdesc.c
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index db832a5f78..df51f3ce1f 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -3,7 +3,7 @@
* gindesc.c
* rmgr descriptor routines for access/transam/gin/ginxlog.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -14,7 +14,7 @@
*/
#include "postgres.h"
-#include "access/gin_private.h"
+#include "access/ginxlog.h"
#include "access/xlogutils.h"
#include "lib/stringinfo.h"
#include "storage/relfilenode.h"
@@ -87,13 +87,13 @@ gin_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIN_INSERT:
{
ginxlogInsert *xlrec = (ginxlogInsert *) rec;
- char *payload = rec + sizeof(ginxlogInsert);
appendStringInfo(buf, "isdata: %c isleaf: %c",
(xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
(xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
if (!(xlrec->flags & GIN_INSERT_ISLEAF))
{
+ char *payload = rec + sizeof(ginxlogInsert);
BlockNumber leftChildBlkno;
BlockNumber rightChildBlkno;
@@ -104,27 +104,32 @@ gin_desc(StringInfo buf, XLogReaderState *record)
appendStringInfo(buf, " children: %u/%u",
leftChildBlkno, rightChildBlkno);
}
- if (!(xlrec->flags & GIN_INSERT_ISDATA))
- appendStringInfo(buf, " isdelete: %c",
- (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F');
- else if (xlrec->flags & GIN_INSERT_ISLEAF)
+ if (XLogRecHasBlockImage(record, 0))
{
- ginxlogRecompressDataLeaf *insertData =
- (ginxlogRecompressDataLeaf *) payload;
-
- if (XLogRecHasBlockImage(record, 0))
+ if (XLogRecBlockImageApply(record, 0))
appendStringInfoString(buf, " (full page image)");
else
- desc_recompress_leaf(buf, insertData);
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
}
else
{
- ginxlogInsertDataInternal *insertData = (ginxlogInsertDataInternal *) payload;
+ char *payload = XLogRecGetBlockData(record, 0, NULL);
+
+ if (!(xlrec->flags & GIN_INSERT_ISDATA))
+ appendStringInfo(buf, " isdelete: %c",
+ (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F');
+ else if (xlrec->flags & GIN_INSERT_ISLEAF)
+ desc_recompress_leaf(buf, (ginxlogRecompressDataLeaf *) payload);
+ else
+ {
+ ginxlogInsertDataInternal *insertData =
+ (ginxlogInsertDataInternal *) payload;
- appendStringInfo(buf, " pitem: %u-%u/%u",
+ appendStringInfo(buf, " pitem: %u-%u/%u",
PostingItemGetBlockNumber(&insertData->newitem),
ItemPointerGetBlockNumber(&insertData->newitem.key),
- ItemPointerGetOffsetNumber(&insertData->newitem.key));
+ ItemPointerGetOffsetNumber(&insertData->newitem.key));
+ }
}
}
break;
@@ -144,12 +149,20 @@ gin_desc(StringInfo buf, XLogReaderState *record)
break;
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
{
- ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
-
if (XLogRecHasBlockImage(record, 0))
- appendStringInfoString(buf, " (full page image)");
+ {
+ if (XLogRecBlockImageApply(record, 0))
+ appendStringInfoString(buf, " (full page image)");
+ else
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
+ }
else
+ {
+ ginxlogVacuumDataLeafPage *xlrec =
+ (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, NULL);
+
desc_recompress_leaf(buf, &xlrec->data);
+ }
}
break;
case XLOG_GIN_DELETE_PAGE:
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index 90bb88109f..dc0506913c 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -3,7 +3,7 @@
* gistdesc.c
* rmgr descriptor routines for access/gist/gistxlog.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -14,7 +14,7 @@
*/
#include "postgres.h"
-#include "access/gist_private.h"
+#include "access/gistxlog.h"
#include "lib/stringinfo.h"
#include "storage/relfilenode.h"
diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c
index d37c9b1aae..35d86dc893 100644
--- a/src/backend/access/rmgrdesc/hashdesc.c
+++ b/src/backend/access/rmgrdesc/hashdesc.c
@@ -3,7 +3,7 @@
* hashdesc.c
* rmgr descriptor routines for access/hash/hash.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -14,15 +14,158 @@
*/
#include "postgres.h"
-#include "access/hash.h"
+#include "access/hash_xlog.h"
void
hash_desc(StringInfo buf, XLogReaderState *record)
{
+ char *rec = XLogRecGetData(record);
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_HASH_INIT_META_PAGE:
+ {
+ xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;
+
+ appendStringInfo(buf, "num_tuples %g, fillfactor %d",
+ xlrec->num_tuples, xlrec->ffactor);
+ break;
+ }
+ case XLOG_HASH_INIT_BITMAP_PAGE:
+ {
+ xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec;
+
+ appendStringInfo(buf, "bmsize %d", xlrec->bmsize);
+ break;
+ }
+ case XLOG_HASH_INSERT:
+ {
+ xl_hash_insert *xlrec = (xl_hash_insert *) rec;
+
+ appendStringInfo(buf, "off %u", xlrec->offnum);
+ break;
+ }
+ case XLOG_HASH_ADD_OVFL_PAGE:
+ {
+ xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec;
+
+ appendStringInfo(buf, "bmsize %d, bmpage_found %c",
+ xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F');
+ break;
+ }
+ case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
+ {
+ xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;
+
+ appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
+ xlrec->new_bucket,
+ (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
+ (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
+ break;
+ }
+ case XLOG_HASH_SPLIT_COMPLETE:
+ {
+ xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec;
+
+ appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u",
+ xlrec->old_bucket_flag, xlrec->new_bucket_flag);
+ break;
+ }
+ case XLOG_HASH_MOVE_PAGE_CONTENTS:
+ {
+ xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec;
+
+ appendStringInfo(buf, "ntups %d, is_primary %c",
+ xlrec->ntups,
+ xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
+ break;
+ }
+ case XLOG_HASH_SQUEEZE_PAGE:
+ {
+ xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;
+
+ appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
+ xlrec->prevblkno,
+ xlrec->nextblkno,
+ xlrec->ntups,
+ xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
+ break;
+ }
+ case XLOG_HASH_DELETE:
+ {
+ xl_hash_delete *xlrec = (xl_hash_delete *) rec;
+
+ appendStringInfo(buf, "clear_dead_marking %c, is_primary %c",
+ xlrec->clear_dead_marking ? 'T' : 'F',
+ xlrec->is_primary_bucket_page ? 'T' : 'F');
+ break;
+ }
+ case XLOG_HASH_UPDATE_META_PAGE:
+ {
+ xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec;
+
+ appendStringInfo(buf, "ntuples %g",
+ xlrec->ntuples);
+ break;
+ }
+ case XLOG_HASH_VACUUM_ONE_PAGE:
+ {
+ xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec;
+
+ appendStringInfo(buf, "ntuples %d",
+ xlrec->ntuples);
+ break;
+ }
+ }
}
const char *
hash_identify(uint8 info)
{
- return NULL;
+ const char *id = NULL;
+
+ switch (info & ~XLR_INFO_MASK)
+ {
+ case XLOG_HASH_INIT_META_PAGE:
+ id = "INIT_META_PAGE";
+ break;
+ case XLOG_HASH_INIT_BITMAP_PAGE:
+ id = "INIT_BITMAP_PAGE";
+ break;
+ case XLOG_HASH_INSERT:
+ id = "INSERT";
+ break;
+ case XLOG_HASH_ADD_OVFL_PAGE:
+ id = "ADD_OVFL_PAGE";
+ break;
+ case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
+ id = "SPLIT_ALLOCATE_PAGE";
+ break;
+ case XLOG_HASH_SPLIT_PAGE:
+ id = "SPLIT_PAGE";
+ break;
+ case XLOG_HASH_SPLIT_COMPLETE:
+ id = "SPLIT_COMPLETE";
+ break;
+ case XLOG_HASH_MOVE_PAGE_CONTENTS:
+ id = "MOVE_PAGE_CONTENTS";
+ break;
+ case XLOG_HASH_SQUEEZE_PAGE:
+ id = "SQUEEZE_PAGE";
+ break;
+ case XLOG_HASH_DELETE:
+ id = "DELETE";
+ break;
+ case XLOG_HASH_SPLIT_CLEANUP:
+ id = "SPLIT_CLEANUP";
+ break;
+ case XLOG_HASH_UPDATE_META_PAGE:
+ id = "UPDATE_META_PAGE";
+ break;
+ case XLOG_HASH_VACUUM_ONE_PAGE:
+ id = "VACUUM_ONE_PAGE";
+ }
+
+ return id;
}
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 7c763b6b0e..44d2d6333f 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -3,7 +3,7 @@
* heapdesc.c
* rmgr descriptor routines for access/heap/heapam.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/rmgrdesc/logicalmsgdesc.c b/src/backend/access/rmgrdesc/logicalmsgdesc.c
index 525826efd3..8287751e48 100644
--- a/src/backend/access/rmgrdesc/logicalmsgdesc.c
+++ b/src/backend/access/rmgrdesc/logicalmsgdesc.c
@@ -3,7 +3,7 @@
* logicalmsgdesc.c
* rmgr descriptor routines for replication/logical/message.c
*
- * Portions Copyright (c) 2015-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2015-2017, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c
index 27c1fb0fc0..9c17447744 100644
--- a/src/backend/access/rmgrdesc/mxactdesc.c
+++ b/src/backend/access/rmgrdesc/mxactdesc.c
@@ -3,7 +3,7 @@
* mxactdesc.c
* rmgr descriptor routines for access/transam/multixact.c
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 7631cb5c73..fbde9d6555 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/